A set that supports searching for members by N-gram string similarity.
In Python 2, items should be unicode strings or plain ASCII str objects (bytestrings) - do not use UTF-8 or other multi-byte encodings, because multi-byte characters will be split up.
Parameters: |
|
---|
Instance variables:
Variables: |
|
---|
Add an item to the N-gram index (if it has not already been added).
>>> from ngram import NGram
>>> n = NGram()
>>> n.add("ham")
>>> list(n)
['ham']
>>> n.add("spam")
>>> list(n)
['ham', 'spam']
Remove all elements from this set.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> list(n)
['eggs', 'spam']
>>> n.clear()
>>> list(n)
[]
Compares two strings and returns their similarity.
Parameters: |
|
---|---|
Returns: | similarity between 0.0 and 1.0. |
>>> from ngram import NGram
>>> NGram.compare('spa', 'spam')
0.375
>>> NGram.compare('ham', 'bam')
0.25
>>> NGram.compare('spam', 'pam') #N=2
0.375
>>> NGram.compare('ham', 'ams', N=1)
0.5
Return a new NGram object with the same settings, and referencing the same items. Copy is shallow in that each item is not recursively copied. Optionally specify alternate items to populate the copy.
>>> from ngram import NGram
>>> from copy import deepcopy
>>> n = NGram(['eggs', 'spam'])
>>> m = n.copy()
>>> m.add('ham')
>>> list(n)
['eggs', 'spam']
>>> list(m)
['eggs', 'ham', 'spam']
>>> p = n.copy(['foo', 'bar'])
>>> list(p)
['foo', 'bar']
Return the difference of two or more sets as a new set.
>>> from ngram import NGram
>>> a = NGram(['spam', 'eggs'])
>>> b = NGram(['spam', 'ham'])
>>> list(a.difference(b))
['eggs']
Remove from this set all elements that are in the other set.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> other = set(['spam'])
>>> n.difference_update(other)
>>> list(n)
['eggs']
Remove an element from a set if it is a member.
If the element is not a member, do nothing.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> n.discard('spam')
>>> n.discard('ham')
>>> list(n)
['eggs']
Simply return the best match to the query, None on no match.
>>> from ngram import NGram
>>> n = NGram(["Spam","Eggs","Ham"], key=lambda x:x.lower(), N=1)
>>> n.find('Hom')
'Ham'
>>> n.find("Spom")
'Spam'
>>> n.find("Spom", 0.8)
Return the most similar item to the provided one, or None if nothing exceeds the threshold.
>>> from ngram import NGram
>>> n = NGram([(0, "Spam"), (1, "Ham"), (2, "Eggsy"), (3, "Egggsy")],
... key=lambda x:x[1].lower())
>>> n.finditem((3, 'Hom'))
(1, 'Ham')
>>> n.finditem((4, "Oggsy"))
(2, 'Eggsy')
>>> n.finditem((4, "Oggsy"), 0.8)
Return the intersection of two or more sets as a new set.
>>> from ngram import NGram
>>> a = NGram(['spam', 'eggs'])
>>> b = NGram(['spam', 'ham'])
>>> list(a.intersection(b))
['spam']
Update the set with the intersection of itself and other sets.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> other = set(['spam', 'ham'])
>>> n.intersection_update(other)
>>> list(n)
['spam']
Retrieve the subset of items that share n-grams with the query string.
Parameters: | query – look up items that share N-grams with this string. |
---|---|
Returns: | mapping from matched string to the number of shared N-grams. |
>>> from ngram import NGram
>>> n = NGram(["ham","spam","eggs"])
>>> n.items_sharing_ngrams("mam")
{'ham': 2, 'spam': 2}
Get the key string for the item.
>>> from ngram import NGram
>>> n = NGram(key=lambda x:x[1])
>>> n.key((3,"ham"))
'ham'
Similarity for two sets of n-grams.
Note : | similarity = (a**e - d**e)/a**e where a is “all n-grams”, d is “different n-grams” and e is the warp. |
---|---|
Parameters: |
|
Returns: | similarity in the range 0.0 to 1.0. |
>>> from ngram import NGram
>>> NGram.ngram_similarity(5, 10)
0.5
>>> NGram.ngram_similarity(5, 10, warp=2)
0.75
>>> NGram.ngram_similarity(5, 10, warp=3)
0.875
>>> NGram.ngram_similarity(2, 4, warp=2)
0.75
>>> NGram.ngram_similarity(3, 4)
0.75
Iterates over the ngrams of a string (no padding).
>>> from ngram import NGram
>>> n = NGram()
>>> list(n._split("hamegg"))
['ham', 'ame', 'meg', 'egg']
Pads a string and iterates over its ngrams.
>>> from ngram import NGram
>>> n = NGram()
>>> list(n.split("ham"))
['$$h', '$ha', 'ham', 'am$', 'm$$']
Pad a string in preparation for splitting into ngrams.
>>> from ngram import NGram
>>> n = NGram()
>>> n.pad('ham')
'$$ham$$'
Remove and return an arbitrary set element. Raises KeyError if the set is empty.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> n.pop()
'eggs'
Remove an item from the set. Inverts the add operation.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> n.remove('spam')
>>> list(n)
['eggs']
Search the index for items whose key exceeds the threshold similarity to the query string.
Parameters: | query – returned items will have at least threshold similarity to the query string. |
---|---|
Returns: | list of pairs of (item, similarity) by decreasing similarity. |
>>> from ngram import NGram
>>> n = NGram([(0, "SPAM"), (1, "SPAN"), (2, "EG")], key=lambda x:x[1])
>>> n.search("SPA")
[((0, 'SPAM'), 0.375), ((1, 'SPAN'), 0.375)]
>>> n.search("M")
[((0, 'SPAM'), 0.125)]
>>> n.search("EG")
[((2, 'EG'), 1.0)]
Search the index for items whose key exceeds the threshold similarity to the key of the given item.
Returns: | list of pairs of (item, similarity) by decreasing similarity. |
---|
>>> from ngram import NGram
>>> n = NGram([(0, "SPAM"), (1, "SPAN"), (2, "EG"),
... (3, "SPANN")], key=lambda x:x[1])
>>> n.searchitem((2, "SPA"), 0.35)
[((0, 'SPAM'), 0.375), ((1, 'SPAN'), 0.375)]
Pads a string and iterates over its ngrams.
>>> from ngram import NGram
>>> n = NGram()
>>> list(n.split("ham"))
['$$h', '$ha', 'ham', 'am$', 'm$$']
Pads the string key of an item and iterates over its ngrams.
>>> from ngram import NGram
>>> n = NGram(key=lambda x:x[1])
>>> item = (3,"ham")
>>> list(n.splititem(item))
['$$h', '$ha', 'ham', 'am$', 'm$$']
Return the symmetric difference of two sets as a new set.
>>> from ngram import NGram
>>> a = NGram(['spam', 'eggs'])
>>> b = NGram(['spam', 'ham'])
>>> list(a.symmetric_difference(b))
['eggs', 'ham']
Update the set with the symmetric difference of itself and other.
>>> from ngram import NGram
>>> n = NGram(['spam', 'eggs'])
>>> other = set(['spam', 'ham'])
>>> n.symmetric_difference_update(other)
>>> list(n)
['eggs', 'ham']
Return the union of two or more sets as a new set.
>>> from ngram import NGram
>>> a = NGram(['spam', 'eggs'])
>>> b = NGram(['spam', 'ham'])
>>> list(a.union(b))
['eggs', 'ham', 'spam']
Update the set with new items.
>>> from ngram import NGram
>>> n = NGram(["spam"])
>>> n.update(["eggs"])
>>> list(n)
['eggs', 'spam']