# Source code for indicsyllabifier.core

#!/usr/bin/env python
# mlsplit - Split Malayalam words into letters
# This script splits Malayalam words into letters.
# Ref: http://tinyurl.com/3v729s
# Copyright (C) 2008 Baiju M <baiju.m.mail AT gmail.com>
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import re
from silpa_common.langdetect import detect_lang


class Syllabalizer:
    """
    Syllabalizer class provides methods to syllabify unicode strings.

    The Indic methods (``syllabify_ml``, ``syllabify_kn``, ``syllabify_bn``,
    ``syllabify_hi``, ``syllabify_ta``) return a list of strings, one item
    per cluster. ``syllabify_en`` returns a single string with separator
    characters inserted.
    """

    # Punctuation characters that always become a list item of their own.
    _LIMITERS = frozenset(['.', '\"', '\'', '`', '!', ';', ',', '?'])

    # Per-script characters that are appended to the preceding cluster.
    _ML_SIGNS = frozenset([
        u'\u0d02', u'\u0d03', u'\u0d3e', u'\u0d3f', u'\u0d40', u'\u0d41',
        u'\u0d42', u'\u0d43', u'\u0d44', u'\u0d46', u'\u0d47', u'\u0d48',
        u'\u0d4a', u'\u0d4b', u'\u0d4c', u'\u0d4d', u'\u0d57',
    ])
    _KN_SIGNS = frozenset([
        u'\u0c82', u'\u0c83', u'\u0cbd', u'\u0cbe', u'\u0cbf', u'\u0cc0',
        u'\u0cc1', u'\u0cc2', u'\u0cc3', u'\u0cc4', u'\u0cc6', u'\u0cc7',
        u'\u0cc8', u'\u0cca', u'\u0ccb', u'\u0ccc', u'\u0ccd',
    ])
    _BN_SIGNS = frozenset([
        u'\u0981', u'\u0982', u'\u0983', u'\u09bd', u'\u09be', u'\u09bf',
        u'\u09c0', u'\u09c1', u'\u09c2', u'\u09c3', u'\u09c4', u'\u09c6',
        u'\u09c7', u'\u09c8', u'\u09ca', u'\u09cb', u'\u09cc', u'\u09cd',
        u'\u09d7',
    ])
    _HI_SIGNS = frozenset([
        u'\u0902', u'\u0903', u'\u093e', u'\u093f', u'\u0940', u'\u0941',
        u'\u0942', u'\u0943', u'\u0944', u'\u0946', u'\u0947', u'\u0948',
        u'\u094a', u'\u094b', u'\u094c', u'\u094d',
    ])
    _TA_SIGNS = frozenset([
        u'\u0b81', u'\u0b82', u'\u0b83', u'\u0bbd', u'\u0bbe', u'\u0bbf',
        u'\u0bc0', u'\u0bc1', u'\u0bc2', u'\u0bc3', u'\u0bc4', u'\u0bc6',
        u'\u0bc7', u'\u0bc8', u'\u0bca', u'\u0bcb', u'\u0bcc', u'\u0bcd',
        u'\u0bd7',
    ])

    def _split_clusters(self, text, signs, joiner):
        """
        Split text into clusters using one script's sign table.

        :param text: the input text.
        :param signs: characters that attach to the preceding cluster.
        :param joiner: the script's virama; a character that follows a
            cluster ending in it is appended to that cluster.
        :returns: list of cluster strings.
        """
        clusters = []
        for char in text:
            if char in self._LIMITERS:
                clusters.append(char)
            elif clusters and (char in signs or clusters[-1][-1] == joiner):
                clusters[-1] += char
            else:
                # Also reached for a sign at the very start of the text:
                # there is no cluster to attach to, so it starts its own
                # item instead of raising IndexError.
                clusters.append(char)
        return clusters

    def syllabify_ml(self, text):
        """Split Malayalam text into a list of clusters."""
        return self._split_clusters(text, self._ML_SIGNS, u'\u0d4d')

    def syllabify_kn(self, text):
        """Split Kannada text into a list of clusters."""
        return self._split_clusters(text, self._KN_SIGNS, u'\u0ccd')

    def syllabify_bn(self, text):
        """Split Bengali text into a list of clusters."""
        return self._split_clusters(text, self._BN_SIGNS, u'\u09cd')

    def syllabify_hi(self, text):
        """Split Hindi text into a list of clusters."""
        return self._split_clusters(text, self._HI_SIGNS, u'\u094d')

    def syllabify_ta(self, text):
        """Split Tamil text into a list of clusters."""
        return self._split_clusters(text, self._TA_SIGNS, u'\u0bcd')

    #Source: http://www.python-forum.org/pythonforum/viewtopic.php?
    #f=14&t=5810#p42091
    #Author: Cabu
    def syllabify_en(self, text):
        """
        Mark syllable boundaries in English text.

        The text is padded with one space on each side and the padded
        string is returned with separators inserted: ``#`` between digits,
        ``+`` by the two regex rules for ``i``/``the`` followed by a vowel
        and a consonant, ``_`` between adjacent vowels that are not a known
        pair, and ``-`` or ``/`` around consonants. The manual scans only
        look at lowercase letters.

        :param text: the input text.
        :returns: the padded text with separator characters inserted.
        """
        text = " " + text + " "
        vowel_list = ['a', 'e', 'i', 'o', 'u', 'y']
        vowel_pairs = ['ai', 'au', 'aw', 'ee', 'ea', 'oa', 'oi', 'ou',
                       'oo', 'ow', 'oy', 'uu']
        consonant_list = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                          'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', 'z']
        consonant_blends = ['bl', 'br', 'ch', 'chr', 'cl', 'cr', 'dr', 'fl',
                            'fr', 'gl', 'gr', 'kn', 'pl', 'pr', 'sc', 'sh',
                            'sk', 'sl', 'sm', 'sn', 'sp', 'spr', 'squ', 'st',
                            'str', 'sw', 'th', 'tr', 'thr', 'nt', 'wh']

        # Cut numbers in digits
        # Two passes: matches do not overlap, so one pass leaves every
        # other digit pair unsplit.
        p = re.compile("([0-9])([0-9])", re.IGNORECASE)
        for _ in range(2):
            text = p.sub("\\1#\\2", text)

        # Cut i / vowel (- o) / consonant
        p = re.compile("i([aeiuy])([bcdfghjklmnpqrstvwxz])", re.IGNORECASE)
        text = p.sub("i+\\1+\\2", text)

        # Cut the / vowel / consonant
        p = re.compile("the([aeiouy])([bcdfghjklmnpqrstvwxz])",
                       re.IGNORECASE)
        text = p.sub("the+\\1+\\2", text)

        # Cut vowel / vowel except for pairs
        position = 0
        while position < len(text)-1:
            if (text[position] in vowel_list
                    and text[position+1] in vowel_list):
                if not (text[position:position+2] in vowel_pairs):
                    if not (text[position-1:position+3] in
                            ["tion", "dual", "nion", "quir", "tiou"]):
                        text = text[:position+1] + "_" + text[position+1:]
            position = position + 1

        # Cut consonant / consonant (ll, mm, ...)
        p = re.compile("([bcdfghjklmnpqrstvwxz])\\1([^ ])", re.IGNORECASE)
        text = p.sub("\\1-\\1\\2", text)

        # Cut vowel / consonant vowel
        start = 0
        end = 0
        while start < len(text)-1:
            if text[start] in vowel_list and text[start+1] in consonant_list:
                end = start + 1
                while end <= len(text)-1 and text[end] in consonant_list:
                    end = end + 1
                if (end <= len(text)-1
                        and (text[start+1:end] in consonant_list
                             or text[start+1:end] in consonant_blends)
                        and text[end] in vowel_list
                        and text[end:end+2] != "e "):
                    text = text[:start+1] + "/" + text[start+1:]
            start = start + 1

        # Cut vowel consonant / consonant+ vowel (trumpet,
        # simple, understanding, ...)
        start = 0
        end = 0
        while start < len(text)-1:
            if text[start] in vowel_list and text[start+1] in consonant_list:
                end = start + 2
                while end <= len(text)-1 and text[end] in consonant_list:
                    end = end + 1
                if (end <= len(text)-1 and end > start+2
                        and text[end] in vowel_list):
                    if not (text[start+1:end] in consonant_blends):
                        text = text[:start+2] + "-" + text[start+2:]
            start = start + 1

        # Return the words splitted
        return text

    def get_module_name(self):
        """
        return module name
        """
        return "Syllabification"

    def get_info(self):
        """
        return module info
        """
        return "Syllabify each word in the given text"

    def syllabify(self, text):
        """
        syllabifies the given text

        The language is detected from the first word of the text and the
        whole text is passed to the matching ``syllabify_*`` method.

        :param text: the input text.
        :type text: str.
        :returns: an empty list for blank text; a list of clusters for the
            Indic languages; a marked-up string for English; otherwise a
            list of the individual characters of the text.
        """
        if text.strip() == "":
            return []

        # split() skips leading whitespace, so the word used for detection
        # is never empty here.
        first_word = text.split()[0]
        try:
            # NOTE(review): detect_lang is defined outside this file; it is
            # assumed to return a mapping keyed by the word - confirm.
            lang = detect_lang(first_word)[first_word]
        except KeyError:
            # No entry for the word: use the per-character fallback below.
            lang = None

        handlers = {
            "ml_IN": self.syllabify_ml,
            "hi_IN": self.syllabify_hi,
            "kn_IN": self.syllabify_kn,
            "bn_IN": self.syllabify_bn,
            "ta_IN": self.syllabify_ta,
            "en_US": self.syllabify_en,
        }
        handler = handlers.get(lang)
        if handler is not None:
            return handler(text)

        # Unsupported or undetected language: one item per character.
        return list(text)
def getInstance():
    """Create and return a new Syllabalizer object."""
    instance = Syllabalizer()
    return instance