# Source code for indicsyllabifier.core

#!/usr/bin/env python
# mlsplit - Split Malayalam words into letters
# This script splits Malayalam words into letters.
# Ref: http://tinyurl.com/3v729s
# Copyright (C) 2008 Baiju M <baiju.m.mail AT gmail.com>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import re
from silpa_common.langdetect import detect_lang


class Syllabalizer:
    """
    Syllabalizer class provides methods to syllabify unicode strings.

    The per-script methods (``syllabify_ml``, ``syllabify_kn``,
    ``syllabify_bn``, ``syllabify_hi``, ``syllabify_ta``) return a list of
    character clusters.  ``syllabify_en`` returns a single string with
    separator characters inserted.  ``syllabify`` picks one of them based
    on the detected language of the first word.
    """

    # Punctuation that always becomes an element of its own, even when the
    # previous cluster ends in a virama.  Every entry must be a single
    # character because the input is scanned one character at a time.
    _LIMITERS = ('.', '"', '\'', '`', '!', ';', ',', '?')

    def _group_clusters(self, text, signs, virama):
        """
        Group the characters of ``text`` into clusters.

        :param text: the input text.
        :param signs: dependent signs that attach to the preceding cluster.
        :param virama: the conjunct-forming sign; any non-limiter character
            that follows it is attached to the same cluster.
        :returns: list of cluster strings, in input order.
        """
        clusters = []
        for char in text:
            if char in self._LIMITERS:
                clusters.append(char)
            elif clusters and (char in signs or clusters[-1][-1] == virama):
                # A dependent sign, or anything right after a virama,
                # extends the cluster that is currently open.
                clusters[-1] += char
            else:
                # Ordinary character, or a sign with nothing before it to
                # attach to: start a new cluster instead of failing.
                clusters.append(char)
        return clusters

    def syllabify_ml(self, text):
        """
        Split Malayalam text into a list of character clusters.

        :param text: the input text.
        :returns: list of cluster strings.
        """
        signs = (u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40',
                 u'\u0d41', u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46',
                 u'\u0d47', u'\u0d48', u'\u0d4a', u'\u0d4b', u'\u0d4c',
                 u'\u0d4d', u'\u0d57')
        chandrakkala = u'\u0d4d'
        return self._group_clusters(text, signs, chandrakkala)

    def syllabify_kn(self, text):
        """
        Split Kannada text into a list of character clusters.

        :param text: the input text.
        :returns: list of cluster strings.
        """
        signs = (u'\u0c82', u'\u0c83', u'\u0cbd', u'\u0cbe', u'\u0cbf',
                 u'\u0cc0', u'\u0cc1', u'\u0cc2', u'\u0cc3', u'\u0cc4',
                 u'\u0cc6', u'\u0cc7', u'\u0cc8', u'\u0cca', u'\u0ccb',
                 u'\u0ccc', u'\u0ccd')
        halant = u'\u0ccd'
        return self._group_clusters(text, signs, halant)

    def syllabify_bn(self, text):
        """
        Split Bengali text into a list of character clusters.

        :param text: the input text.
        :returns: list of cluster strings.
        """
        signs = (u'\u0981', u'\u0982', u'\u0983', u'\u09bd', u'\u09be',
                 u'\u09bf', u'\u09c0', u'\u09c1', u'\u09c2', u'\u09c3',
                 u'\u09c4', u'\u09c6', u'\u09c7', u'\u09c8', u'\u09ca',
                 u'\u09cb', u'\u09cc', u'\u09cd', u'\u09d7')
        halant = u'\u09cd'
        return self._group_clusters(text, signs, halant)

    def syllabify_hi(self, text):
        """
        Split Hindi (Devanagari) text into a list of character clusters.

        :param text: the input text.
        :returns: list of cluster strings.
        """
        signs = (u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940',
                 u'\u0941', u'\u0942', u'\u0943', u'\u0944', u'\u0946',
                 u'\u0947', u'\u0948', u'\u094a', u'\u094b', u'\u094c',
                 u'\u094d')
        virama = u'\u094d'
        return self._group_clusters(text, signs, virama)

    def syllabify_ta(self, text):
        """
        Split Tamil text into a list of character clusters.

        :param text: the input text.
        :returns: list of cluster strings.
        """
        signs = (u'\u0b81', u'\u0b82', u'\u0b83', u'\u0bbd', u'\u0bbe',
                 u'\u0bbf', u'\u0bc0', u'\u0bc1', u'\u0bc2', u'\u0bc3',
                 u'\u0bc4', u'\u0bc6', u'\u0bc7', u'\u0bc8', u'\u0bca',
                 u'\u0bcb', u'\u0bcc', u'\u0bcd', u'\u0bd7')
        virama = u'\u0bcd'
        return self._group_clusters(text, signs, virama)

    # Source: http://www.python-forum.org/pythonforum/viewtopic.php?
    # f=14&t=5810#p42091
    # Author: Cabu

    def syllabify_en(self, text):
        """
        Mark syllable boundaries in English text.

        Unlike the other ``syllabify_*`` methods this returns a string, not
        a list.  Boundaries are marked by inserting ``#`` (between digits),
        ``+``, ``_``, ``-`` or ``/`` depending on the rule that fired.  The
        result keeps the single leading and trailing space that is added
        for the boundary checks.

        :param text: the input text.
        :returns: the padded text with separator characters inserted.
        """
        # Pad so that look-behind/look-ahead slices never fall off the ends
        # and so that a word-final "e " can be recognised.
        text = " " + text + " "
        vowel_list = ['a', 'e', 'i', 'o', 'u', 'y']
        vowel_pairs = ['ai', 'au', 'aw', 'ee', 'ea', 'oa', 'oi', 'ou',
                       'oo', 'ow', 'oy', 'uu']
        consonant_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                          'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']
        consonant_blends = ['bl', 'br', 'ch', 'chr', 'cl', 'cr', 'dr', 'fl',
                            'fr', 'gl', 'gr', 'kn', 'pl', 'pr', 'sc', 'sh',
                            'sk', 'sl', 'sm', 'sn', 'sp', 'spr', 'squ', 'st',
                            'str', 'sw', 'th', 'tr', 'thr', 'nt', 'wh']

        # Cut numbers in digits.  Matches do not overlap, so a second pass
        # is needed to split the pairs the first pass stepped over.
        p = re.compile(r"([0-9])([0-9])", re.IGNORECASE)
        for _ in range(2):
            text = p.sub(r"\1#\2", text)

        # Cut i / vowel (- o) / consonant
        p = re.compile(r"i([aeiuy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
        text = p.sub(r"i+\1+\2", text)

        # Cut the / vowel / consonant
        p = re.compile(r"the([aeiouy])([bcdfghjklmnpqrstvwxz])",
                       re.IGNORECASE)
        text = p.sub(r"the+\1+\2", text)

        # Cut vowel / vowel except for pairs
        position = 0
        while position < len(text) - 1:
            if (text[position] in vowel_list and
                    text[position + 1] in vowel_list):
                if text[position:position + 2] not in vowel_pairs:
                    # Four-letter contexts in which the vowels stay joined.
                    if text[position - 1:position + 3] not in (
                            "tion", "dual", "nion", "quir", "tiou"):
                        text = (text[:position + 1] + "_" +
                                text[position + 1:])
            position = position + 1

        # Cut consonant / consonant (ll, mm, ...)
        p = re.compile(r"([bcdfghjklmnpqrstvwxz])\1([^ ])", re.IGNORECASE)
        text = p.sub(r"\1-\1\2", text)

        # Cut vowel / consonant vowel
        start = 0
        while start < len(text) - 1:
            if text[start] in vowel_list and text[start + 1] in consonant_list:
                # Find the end of the consonant run that follows the vowel.
                end = start + 1
                while end <= len(text) - 1 and text[end] in consonant_list:
                    end = end + 1
                if (
                        end <= len(text) - 1 and
                        (text[start + 1:end] in consonant_list or
                            text[start + 1:end] in consonant_blends) and
                        text[end] in vowel_list and
                        # No cut before an "e" that is followed by a space.
                        text[end:end + 2] != "e "
                ):
                    text = text[:start + 1] + "/" + text[start + 1:]
            start = start + 1

        # Cut vowel consonant / consonant+ vowel (trumpet,
        # simple, understanding, ...)
        start = 0
        while start < len(text) - 1:
            if text[start] in vowel_list and text[start + 1] in consonant_list:
                end = start + 2
                while end <= len(text) - 1 and text[end] in consonant_list:
                    end = end + 1
                # Needs at least two consonants between the two vowels.
                if (end <= len(text) - 1 and end > start + 2 and
                        text[end] in vowel_list):
                    if text[start + 1:end] not in consonant_blends:
                        text = text[:start + 2] + "-" + text[start + 2:]
            start = start + 1

        # Return the words splitted
        return text

    def get_module_name(self):
        """
        return module name
        """
        return "Syllabification"

    def get_info(self):
        """
        return module info
        """
        return "Syllabify each word in the given text"

    def syllabify(self, text):
        """
        syllabifies the given text

        The language is detected from the first space-separated word only
        and the whole text is then handed to the matching method.

        :param text: the input  text.
        :type text: str.
        :returns: an empty list for blank input; a string with separator
            characters for text detected as ``en_US``; otherwise a list of
            clusters (one element per character when the detected language
            has no dedicated method).
        """
        if text.strip() == "":
            return []
        first_word = text.split(" ")[0]
        lang = detect_lang(first_word)[first_word]
        handlers = {
            "ml_IN": self.syllabify_ml,
            "hi_IN": self.syllabify_hi,
            "kn_IN": self.syllabify_kn,
            "bn_IN": self.syllabify_bn,
            "ta_IN": self.syllabify_ta,
            "en_US": self.syllabify_en,
        }
        handler = handlers.get(lang)
        if handler is not None:
            return handler(text)
        # No dedicated method for this language: one element per character.
        return list(text)


def getInstance():
    """Create and return a new Syllabalizer object."""
    instance = Syllabalizer()
    return instance