Source code for inexactsearch.core
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Approximate Search
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions email: santhosh.thottingal@gmail.com
_all_ = ['InexactSearch', 'getInstance']
import os
import soundex
[docs]class InexactSearch:
"""
This class provides methods for fuzzy searching using word distance
as well as phonetics.
"""
[docs] def bigram_average(self,str1, str2):
"""Return approximate string comparator measure (between 0.0 and 1.0)
using bigrams.
:param str1: string 1 for comparison
:str1 type : str
:param str2: string 2 for comparison
:str2 type : str
:returns: int score between 0.0 and 1.0
>>> score = bigram_avearage(str1, str2)
0.7
Bigrams are two-character sub-strings contained in a string. For example,
'peter' contains the bigrams: pe,et,te,er.
This routine counts the number of common bigrams and divides by the
average number of bigrams. The resulting number is returned.
"""
# Quick check if the strings are the same - - - - - - - - - - - - - - - - - -
#
if (str1 == str2):
return 1
bigr1 = []
bigr2 = []
# Make a list of bigrams for both strings - - - - - - - - - - - - - - - - - -
#
for i in range(1, len(str1)):
bigr1.append(str1[i-1:i+1])
for i in range(1, len(str2)):
bigr2.append(str2[i-1:i+1])
# Compute average number of bigrams - - - - - - - - - - - - - - - - - - - - -
#
average = (len(bigr1)+len(bigr2)) / 2.0
if (average == 0.0):
return str1
# Get common bigrams - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
common = 0.0
if (len(bigr1) < len(bigr2)): # Count using the shorter bigram list
short_bigr = bigr1
long_bigr = bigr2
else:
short_bigr = bigr2
long_bigr = bigr1
for b in short_bigr:
if (b in long_bigr):
if long_bigr.index(b) == short_bigr.index(b) :
common += 1.0
else:
dislocation = (long_bigr.index(b) -
short_bigr.index(b)) / average
if dislocation < 0 :
dislocation = dislocation * -1
common += 1.0 - dislocation
long_bigr[long_bigr.index(b)] = [] # Mark this bigram as counted
w = common / average
return w
[docs] def compare(self, string1, string2):
"""
Compare strings using soundex if not possible givees biggram avearage.
:param str1: string 1 for comparison.
:type str1: str.
:param str2: string 2 for comparison
:type str2: str.
:returns: int score between 0.0 and 1.0
"""
weight = 0
if string1 == string2:
return 1
sx = soundex.getInstance()
soundex_match = sx.compare(string1, string2)
if soundex_match == 0:
weight = 1.0
if soundex_match == 1:
weight = 0.9
if soundex_match == 2:
weight = 0.8
if weight == 0:
return self.bigram_average(string1, string2)
else:
return weight
[docs] def search(self, text, key):
"""
Searches for the key in the given text.
:param text: text in which search has to be done.
:type text: str.
:param key: key which has to be searched
:type key: str.
:returns: A dictionary with words in the string as keys and the score against the key as the value
"""
key = key.strip()
words = text.split()
search_results = {}
for word in words:
word = word.strip()
search_results[word]= self.compare(word, key)
return search_results
[docs] def get_module_name(self):
""" returns module name"""
return "Approximate Search"
[docs] def get_info(self):
""" Returns info on the module"""
return "Approximate Search for a string in the given text. Based on bigram search algorithm and indic soundex algorithms"
[docs]def getInstance():
"""
returns an instance of :class: `InexactSearch` class.
"""
return InexactSearch()