Module `dhivehi_nlp.trigram_similarity`

Trigram similarity divides words or phrases into sequences of three consecutive letters, placed in a set where the order doesn't matter and duplicates are removed. Used to find string matches even if certain characters are different, based on similarity value.

ލިޔުނު އަކުރުތަކާ ނުވަތަ ބަސްތަކާއި އެއްގޮތް ބަސްތައް ހޯދުން

Expand source code

"""Trigram similarity divides words or phrases into sequences of three
consecutive letters, placed in a set where the order doesn't matter and
duplicates are removed. Used to find string matches even if certain characters
are different, based on similarity value.

ލިޔުނު އަކުރުތަކާ ނުވަތަ ބަސްތަކާއި އެއްގޮތް ބަސްތައް ހޯދުން
"""

import re
from dhivehi_nlp import dictionary


def generate_trigrams(text: str):
    """
    Build the set of character trigrams found in a piece of text.

    The text is first stripped of surrounding whitespace. Each space that
    remains is replaced by two spaces, and the result is padded with two
    spaces at the start and one space at the end. Every run of three
    consecutive characters of the padded string is one trigram; because a set
    is returned, repeated trigrams collapse and no order is kept.

    >>> generate_trigrams("ބަޔަކު")
    {
        "  ބ",
        " ބަ",
        "ބަޔ",
        "ަޔަ",
        "ޔަކ",
        "ަކު",
        "ކު ",
    }
    """
    padded = "  " + text.strip().replace(" ", "  ") + " "
    trigrams = set()
    # Slide a three-character window over the padded string.
    for start in range(len(padded) - 2):
        trigrams.add(padded[start : start + 3])
    return trigrams


def _compare_trigrams(trig1: set, trig2: set):
    """
    Checks how many trigrams from the first set are present in the second and
    returns that value divided by the length of the second set.
    """
    count = 0
    for i in trig1:
        if i in trig2:
            count += 1
    return count / len(trig2)


def get_similarity(query: str, text=None, max_output=10):
    """
    Rank words by their trigram similarity to a query string.

    Each candidate word is scored as the number of query trigrams that also
    occur in the word, divided by the number of trigrams in the word. The
    result is a list of ``{"word": ..., "similarity": ...}`` dictionaries
    ordered by similarity in descending order.

    The text keyword argument should be a list of strings. If it is not
    provided (``None``), the wordlist from the dictionary is used instead.
    The max_output keyword argument is the maximum length of the returned
    list and defaults to 10.

    >>> get_similarity("ބަޔަކު", max_output=5)
    [
        {"word": "ބަޔަކު", "similarity": 1.0},
        {"word": "ބަ", "similarity": 0.6666666666666666},
        {"word": "ބ", "similarity": 0.5},
        {"word": "ބަޔޭބަޔޭ", "similarity": 0.42857142857142855},
        {"word": "ބަޔާން", "similarity": 0.42857142857142855},
    ]

    >>> text = "ރަށްތައް އުފެދިފައިވާ ގޮތުން ވަކިވަކި ކުދިކުދި ރަށްރަށް ހުރި ކަމުގައި ވިޔަސް އެއްބަޔަކު އަނެއް ބަޔަކަށް ބަރޯސާވާ ކަމާއި ވަކި ދަތުރުފަތުރުކޮށް އެއްބައެއްގެ".split()
    >>> get_similarity("ބަޔަކު", text, max_output=3)
    [
        {"word": "ބަޔަކަށް", "similarity": 0.5555555555555556},
        {"word": "އެއްބަޔަކު", "similarity": 0.45454545454545453},
        {"word": "ބަރޯސާވާ", "similarity": 0.2222222222222222},
    ]
    """
    query_trig = generate_trigrams(query)
    # Identity test: ``== None`` would call the argument's own ``__eq__``,
    # which misbehaves for containers that compare element-wise.
    if text is None:
        text = dictionary.get_wordlist()
    results = [
        {
            "word": word,
            "similarity": _compare_trigrams(query_trig, generate_trigrams(word)),
        }
        for word in text
    ]
    # Stable ascending sort followed by a reversal: words with equal
    # similarity come out in the reverse of their input order. Sorting with
    # ``reverse=True`` would flip that tie order, so it is not used here.
    results.sort(key=lambda entry: entry["similarity"])
    results.reverse()
    if len(results) > max_output:
        results = results[:max_output]
    return results

Functions

def generate_trigrams(text: str)

Return the set of trigrams in a given text. Before the trigrams are extracted, each space in the text is replaced with two spaces, and the string is padded with two spaces at the beginning and one space at the end.

ލިޔެފައިވާ ބަހުގެ ނުވަތަ ބަސްތަކުގެ ޓްރައިގްރާމްތައް ސެޓެއްގެ ގޮތުގައި އަނބުރާ ދޭނެއެވެ

>>> generate_trigrams("ބަޔަކު")
{
    "  ބ",
    " ބަ",
    "ބަޔ",
    "ަޔަ",
    "ޔަކ",
    "ަކު",
    "ކު ",
}

Expand source code

def generate_trigrams(text: str):
    """
    Build the set of character trigrams found in a piece of text.

    The text is first stripped of surrounding whitespace. Each space that
    remains is replaced by two spaces, and the result is padded with two
    spaces at the start and one space at the end. Every run of three
    consecutive characters of the padded string is one trigram; because a set
    is returned, repeated trigrams collapse and no order is kept.

    >>> generate_trigrams("ބަޔަކު")
    {
        "  ބ",
        " ބަ",
        "ބަޔ",
        "ަޔަ",
        "ޔަކ",
        "ަކު",
        "ކު ",
    }
    """
    padded = "  " + text.strip().replace(" ", "  ") + " "
    trigrams = set()
    # Slide a three-character window over the padded string.
    for start in range(len(padded) - 2):
        trigrams.add(padded[start : start + 3])
    return trigrams

def get_similarity(query: str, text=None, max_output=10)

Finds the trigram similarity of each word to the query string and returns a list of similar words from the text list, ordered by similarity in descending order. The text keyword argument should be a list of strings. If a list of words is not provided in the text keyword argument, the wordlist from the dictionary is used instead. The max_output keyword argument determines the maximum size of the returned list and defaults to 10.

ލިޔުނު ބަހާއި އެއްގޮތް ބަސްތައް އެއްގޮތްވާ ނިސްބަތުން ލިސްޓެއް ގޮތުގައި އަނބުރާ ދޭނެއެވެ

>>> get_similarity("ބަޔަކު", max_output=5)
[
    {"word": "ބަޔަކު", "similarity": 1.0},
    {"word": "ބަ", "similarity": 0.6666666666666666},
    {"word": "ބ", "similarity": 0.5},
    {"word": "ބަޔޭބަޔޭ", "similarity": 0.42857142857142855},
    {"word": "ބަޔާން", "similarity": 0.42857142857142855},
]

>>> text = "ރަށްތައް އުފެދިފައިވާ ގޮތުން ވަކިވަކި ކުދިކުދި ރަށްރަށް ހުރި ކަމުގައި ވިޔަސް އެއްބަޔަކު އަނެއް ބަޔަކަށް ބަރޯސާވާ ކަމާއި ވަކި ދަތުރުފަތުރުކޮށް އެއްބައެއްގެ".split()
>>> get_similarity("ބަޔަކު", text, max_output=3)
[
    {"word": "ބަޔަކަށް", "similarity": 0.5555555555555556},
    {"word": "އެއްބަޔަކު", "similarity": 0.45454545454545453},
    {"word": "ބަރޯސާވާ", "similarity": 0.2222222222222222},
]

Expand source code

def get_similarity(query: str, text=None, max_output=10):
    """
    Rank words by their trigram similarity to a query string.

    Each candidate word is scored as the number of query trigrams that also
    occur in the word, divided by the number of trigrams in the word. The
    result is a list of ``{"word": ..., "similarity": ...}`` dictionaries
    ordered by similarity in descending order.

    The text keyword argument should be a list of strings. If it is not
    provided (``None``), the wordlist from the dictionary is used instead.
    The max_output keyword argument is the maximum length of the returned
    list and defaults to 10.

    >>> get_similarity("ބަޔަކު", max_output=5)
    [
        {"word": "ބަޔަކު", "similarity": 1.0},
        {"word": "ބަ", "similarity": 0.6666666666666666},
        {"word": "ބ", "similarity": 0.5},
        {"word": "ބަޔޭބަޔޭ", "similarity": 0.42857142857142855},
        {"word": "ބަޔާން", "similarity": 0.42857142857142855},
    ]

    >>> text = "ރަށްތައް އުފެދިފައިވާ ގޮތުން ވަކިވަކި ކުދިކުދި ރަށްރަށް ހުރި ކަމުގައި ވިޔަސް އެއްބަޔަކު އަނެއް ބަޔަކަށް ބަރޯސާވާ ކަމާއި ވަކި ދަތުރުފަތުރުކޮށް އެއްބައެއްގެ".split()
    >>> get_similarity("ބަޔަކު", text, max_output=3)
    [
        {"word": "ބަޔަކަށް", "similarity": 0.5555555555555556},
        {"word": "އެއްބަޔަކު", "similarity": 0.45454545454545453},
        {"word": "ބަރޯސާވާ", "similarity": 0.2222222222222222},
    ]
    """
    query_trig = generate_trigrams(query)
    # Identity test: ``== None`` would call the argument's own ``__eq__``,
    # which misbehaves for containers that compare element-wise.
    if text is None:
        text = dictionary.get_wordlist()
    results = [
        {
            "word": word,
            "similarity": _compare_trigrams(query_trig, generate_trigrams(word)),
        }
        for word in text
    ]
    # Stable ascending sort followed by a reversal: words with equal
    # similarity come out in the reverse of their input order. Sorting with
    # ``reverse=True`` would flip that tie order, so it is not used here.
    results.sort(key=lambda entry: entry["similarity"])
    results.reverse()
    if len(results) > max_output:
        results = results[:max_output]
    return results