Source code for spellchecker.core

# -*- coding: utf-8 -*-
# Spellchecker
# Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
import os
import string
import codecs
import inexactsearch
import urllib

from indexer import DictionaryIndex
from langdetect import _detect_lang

__all__ = ['Spellchecker', 'getInstance']


class Spellchecker:
    """
    Spellchecker Class. Contains spell checking and suggestion methods.

    Word lists are read from ``dicts/<lang>.dic`` next to this module.
    Only the words that share their first character with the word being
    looked up are loaded; they are cached in ``NWORDS`` until a word with
    a different first character or a different language is requested.
    """

    def __init__(self):
        # Words loaded for the most recent lookup (None until first use).
        self.NWORDS = None
        # (language, first character) that NWORDS was loaded for.  Without
        # this key a cached list would be reused for words starting with a
        # different letter.
        self._nwords_key = None
        # Language code of the dictionary currently in use.
        self.lang = None
        # language code -> index returned by DictionaryIndex.load_index().
        self.dictionaries = {}

    def words(self, text):
        """Return the set of whitespace-separated tokens in *text*."""
        return set(text.split())

    def train(self, features=None):
        """
        Load the index for the current language unless already loaded.

        :param features: unused; kept for backward compatibility.
        """
        if self.lang not in self.dictionaries:
            index = DictionaryIndex()
            self.dictionaries[self.lang] = \
                index.load_index(self.lang + ".dic")

    def get_wordlist(self, word=""):
        """
        Return the dictionary words that start with the same character
        as *word*, for the current language.

        :param word: the word whose first character selects the section.
        :type word: str.
        :returns: list of words; empty if *word* is empty, the index is
                  unavailable or has no entry for that character.
        """
        index = self.dictionaries.get(self.lang, None)
        if index is None:
            self.train()
            index = self.dictionaries.get(self.lang, None)
        words = []
        if word == "" or index is None:
            return words
        first_char = word[0]
        # The index is used as a mapping from a first character to the
        # byte offset at which reading starts in the dictionary file.
        byte_offset = index.get(first_char, None)
        if byte_offset is None:
            return words
        path = os.path.join(os.path.dirname(__file__),
                            "dicts/" + self.lang + ".dic")
        # The context manager guarantees the file is closed even if
        # reading fails.
        with codecs.open(path, "r", encoding="utf-8",
                         errors="ignore") as fp:
            fp.seek(int(byte_offset))
            while True:
                raw = fp.readline()
                if not raw:
                    # End of file: readline() returns an empty string.
                    # Stopping here prevents an endless loop when the
                    # requested section is the last one in the file.
                    break
                line = raw.strip()
                if not line:
                    # Blank lines are not words.
                    continue
                if line[0] != first_char:
                    # First word starting with another character ends
                    # the section.
                    break
                words.append(line)
        return words

    def levenshtein(self, s1, s2):
        """
        Return the Levenshtein (edit) distance between *s1* and *s2*.

        Uses the two-row dynamic programming formulation, so memory use
        is proportional to the length of the shorter string.
        """
        if len(s1) < len(s2):
            # Keep the shorter string as s2 so the rows stay small.
            return self.levenshtein(s2, s1)
        if not s1:
            return len(s2)
        # A plain list works on both Python 2 and Python 3.
        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(
                    min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    def _select_language(self, word, language):
        """Make *language*, or the detected language of *word*, current."""
        if language is None:
            # Always detect when no language is given.  Comparing
            # self.lang with None first would skip detection on a fresh
            # instance and leave self.lang unset.
            language = _detect_lang(word)[word]
        if language != self.lang:
            self.lang = language
            self.NWORDS = None
            self._nwords_key = None

    def _wordlist_for(self, word):
        """Return the (cached) word list matching *word*'s first letter."""
        key = (self.lang, word[0])
        if self.NWORDS is None or self._nwords_key != key:
            self.NWORDS = self.get_wordlist(word)
            self._nwords_key = key
        return self.NWORDS

    def suggest(self, word, language=None, distance=2):
        """
        Gives a list of words similar to the given word

        :param word: The word for which spelling suggestions are required.
        :type word: str.
        :param language: *optional* language code; detected from the word
                         when omitted.
        :type language: str.
        :param distance: maximum edit distance; only words whose length
                         is within +/- distance of the word length are
                         considered.
        :type distance: int
        :returns: A list of suggested spellings. If the word is found in
                  the dictionary the word itself (a string) is returned;
                  an empty word gives None.

        >>> a.suggest(u"cate")
        [u'cat', u'cater', u'caters', u'cats']

        """
        word = word.strip()
        if word == "":
            return None
        self._select_language(word, language)
        wordlist = self._wordlist_for(word)
        if word in wordlist:
            return word
        candidates = []
        for candidate in wordlist:
            # Cheap length test first: a length difference larger than
            # the threshold already implies a larger edit distance.
            if abs(len(candidate) - len(word)) > distance:
                continue
            if self.levenshtein(candidate, word) <= distance:
                candidates.append(candidate)
        candidates = self.filter_candidates(word, candidates)
        if len(candidates) == 0:
            # Try splitting the word to see whether two words got merged.
            for pos in range(2, len(word) - 2):
                if self.check(word[:pos], self.lang) \
                        and self.check(word[pos:], self.lang):
                    candidates.append(word[:pos] + " " + word[pos:])
                    candidates.append(word[:pos] + "-" + word[pos:])
        return candidates

    def filter_candidates(self, word, candidates):
        """
        Keep only the candidates that inexactsearch scores as at least
        0.6 similar to *word*.
        """
        isearch = inexactsearch.getInstance()
        #TODO sort by score
        return [candidate for candidate in candidates
                if isearch.compare(word, candidate) >= 0.6]

    def check(self, word, language=None):
        """
        Checks whether given word has correct spelling.

        :param word: The word whose spelling is to be checked.
        :type word: str.
        :param language: *optional* language code for the word.
        :type language: str.
        :returns: Boolean True or False; None if the word is empty.

        >>> a.check(u"അംഗദന്‍")
        True

        """
        word = word.strip()
        if word == "":
            return None
        # If it is a number, don't do spellcheck
        if is_number(word):
            return True
        self._select_language(word, language)
        if word in self._wordlist_for(word):
            return True
        # For cased scripts, retry with the first letter in lower case.
        # This covers a word capitalised at the start of a sentence.
        if word.upper() != word.lower():
            newword = word[0].lower() + word[1:]
            return newword in self._wordlist_for(newword)
        return False

    def strip_punctuations(self, s):
        """Return *s* with every ``string.punctuation`` character removed."""
        exclude = set(string.punctuation)
        return ''.join(ch for ch in s if ch not in exclude)

    def check_batch(self, text, language=None):
        """
        Return a list of misspelled words give a chunk of text.

        :param text: Input text (may be URL-quoted).
        :type text: str
        :param language: *optional* language code for the text.
        :type language: str.
        :returns: list of misspelled words, as they appear in the text.

        >>> a.check_batch(u"thire is only one anser")
        [u'thire', u'anser']

        """
        misspelled_words = []
        for word in urllib.unquote(text).split():
            tempword = self.strip_punctuations(word)
            if not tempword:
                # Token was punctuation only; there is nothing to check.
                continue
            if not self.check(tempword, language):
                misspelled_words.append(word)
        return misspelled_words

    def get_module_name(self):
        """
        Returns module name.
        """
        return "Spellchecker"

    def get_info(self):
        """
        Returns module info
        """
        return "Indic Spellchecker"
def getInstance():
    """
    Return a new Spellchecker instance.

    Every call creates a fresh object; no instance is shared.
    """
    return Spellchecker()


def is_number(s):
    """
    Return True if *s* can be parsed as a finite number.

    float() also accepts spellings such as "nan", "inf" and "infinity".
    Those are rejected here so that such tokens are spell checked like
    any other word instead of being waved through as numbers.
    """
    try:
        value = float(s)
    except ValueError:
        return False
    # NaN is the only float not equal to itself; infinities are excluded
    # explicitly.
    return value == value and value not in (float("inf"), float("-inf"))

Related Topics

This Page