Source code for indicngram.core

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ngram
# Copyright 2008-2009 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
import indicsyllabifier


class Ngram:
    """Build syllable, letter and word n-grams from text.

    Create an instance (or call the module-level ``getInstance()``) and use
    one of the ``*Ngram`` methods. The class holds no state.
    """

    @staticmethod
    def _check_window_size(window_size):
        """Coerce ``window_size`` to ``int`` and reject non-positive values.

        :raises ValueError: if the value cannot be converted to an integer
            or is smaller than 1.
        """
        size = int(window_size)
        if size < 1:
            # A zero or negative window has no meaningful n-gram; previously
            # this silently produced empty or truncated slices.
            raise ValueError("window_size must be a positive integer")
        return size

    @staticmethod
    def _windows(items, size):
        """Return every contiguous slice of ``items`` that is ``size`` long.

        Slices are returned in order of their start index. When ``items`` is
        shorter than ``size`` the result is an empty list.
        """
        # range() is empty when len(items) < size, so no guard is needed.
        return [items[start:start + size]
                for start in range(len(items) - size + 1)]

    def syllableNgram(self, text, window_size=2):
        """Return the syllable n-grams of every word in ``text``.

        N-grams never span two words: ``text`` is split on single spaces and
        each piece is syllabified and windowed on its own.

        :param text: The text to be split into ngrams.
        :type text: str.
        :param window_size: window size to be used while making the ngrams.
        :type window_size: int.
        :returns: list of syllable ngrams, each one a slice of the value
            returned by the syllabifier for a word.
        :raises ValueError: if ``window_size`` is not a positive integer.
        """
        size = self._check_window_size(window_size)
        # One syllabifier is enough for the whole call; it used to be
        # re-created for every word.
        syllabifier = indicsyllabifier.getInstance()
        ngrams = []
        # TODO: normalize the text before taking ngrams.
        for word in text.split(" "):
            ngrams.extend(self._windows(syllabifier.syllabify(word), size))
        return ngrams

    def letterNgram(self, word, window_size=2):
        """Return the letter n-grams of ``word``.

        Leading and trailing whitespace is stripped from ``word`` first.

        :param word: The word to be split into ngrams.
        :type word: str.
        :param window_size: window size to be used while making the ngrams.
        :type window_size: int.
        :returns: list of ngrams (substrings of length ``window_size``).
        :raises ValueError: if ``window_size`` is not a positive integer.
        """
        size = self._check_window_size(window_size)
        # TODO: normalize the word before taking ngrams.
        return self._windows(word.strip(), size)

    def wordNgram(self, text, window_size=2):
        """Return the word n-grams of ``text``.

        Words are obtained by splitting ``text`` on runs of whitespace.

        :param text: The text to be split into ngrams.
        :type text: str.
        :param window_size: window size to be used while making the ngrams.
        :type window_size: int.
        :returns: list of word ngrams, each one a list of ``window_size``
            consecutive words.
        :raises ValueError: if ``window_size`` is not a positive integer.
        """
        size = self._check_window_size(window_size)
        return self._windows(text.split(), size)

    def get_module_name(self):
        """Return the module's name."""
        return "Ngram Library"

    def get_info(self):
        """Return a short description of the module."""
        return "Ngram Library for English and Indian languages"
def getInstance():
    """Module-level factory: build and return a new ``Ngram`` object."""
    instance = Ngram()
    return instance

Related Topics

This Page