Функция Soundex для турецкого языка в Python

Алгоритм Soundex изначально разработан для английского языка. В турецком языке много специальных символов, и я не смог найти подходящей реализации для этого случая. Для английского языка есть очень хороший пример: «Реализация Soundex на Python».


person GurhanCagin    schedule 09.01.2019    source источник


Ответы (1)


Я написал две приведённые ниже функции на Python, которые вычисляют код Soundex для турецких слов; они основаны на реализации по ссылке выше. Надеюсь, это пригодится и вам.

def tr_lower(text):
    """Lower-case ``text`` using the Turkish casing rules for the letter I.

    ``str.lower()`` applies the language-neutral Unicode mapping, which turns
    "I" into "i" and dotted "İ" (U+0130) into "i" followed by a combining dot
    (U+0307). Turkish instead pairs "İ" with "i" and "I" with dotless "ı"
    (U+0131), so those two capitals are remapped before the generic lowering
    runs. The remaining Turkish capitals (Ç, Ğ, Ö, Ş, Ü) are already handled
    correctly by ``str.lower()`` and need no special treatment.

    :param text: string to convert.
    :return: lower-cased copy of ``text``.
    """
    # A single translate() pass replaces the per-letter regex substitutions;
    # the patterns were plain one-character literals, so no regex is needed.
    turkish_i = str.maketrans({"\u0130": "i", "I": "\u0131"})
    return text.translate(turkish_i).lower()

def soundex(query: str) -> str:
    """Return a Soundex-style phonetic code for a (Turkish) word.

    The code is the word's first letter followed by exactly three digits and
    is built with the simplified five-step procedure
    (background: https://en.wikipedia.org/wiki/Soundex):

    1. Keep the first letter; drop every later a, e, i, ı, o, ö, u, ü, y, h, w.
    2. Replace the remaining consonants with digits:
       b f p v -> 1; c ç g ğ j k q s ş x z -> 2; d t -> 3; l -> 4;
       m n -> 5; r -> 6.
    3. Collapse each run of identical digits into one digit.
    4. If the first digit equals the digit of the kept first letter, drop
       that digit (the letter already represents it).
    5. Keep at most three digits and pad with zeros up to three.

    Because step 1 removes vowels before step 3 collapses runs, equally coded
    consonants that are separated only by dropped letters are merged as well.

    Turkish handling: the word is lower-cased with ``tr_lower`` and a leading
    ı, ç, ş, ü, ğ or ö is folded to its ASCII look-alike, so that e.g. "şule"
    and "sule" (or "ömer" and "omer") receive the same code. Non-alphabetic
    characters are ignored; letters that are neither dropped nor present in
    the digit table are skipped.

    :param query: word to encode.
    :return: lower-case first letter plus three digits (e.g. ``"p236"`` for
        ``"Pfister"``), or an empty string when ``query`` contains no letters.
    """
    # ASCII stand-ins for a leading Turkish letter; without this fold words
    # that differ only in the diacritic of their first letter never match.
    first_letter_fold = {
        "ı": "i", "ç": "c", "ş": "s", "ü": "u", "ğ": "g", "ö": "o",
    }
    # Letters that carry no digit and are dropped after the first position.
    ignored = frozenset("aeiıoöuüyhw")
    # Flat letter -> digit table so each lookup is a single dict access.
    digit_groups = (
        ("bfpv", 1),
        ("cçgğjkqsşxz", 2),
        ("dt", 3),
        ("l", 4),
        ("mn", 5),
        ("r", 6),
    )
    digit_of = {char: digit for group, digit in digit_groups for char in group}

    # Step 0: normalise case and keep letters only.
    letters = [char for char in tr_lower(query) if char.isalpha()]
    if not letters:
        # Nothing to encode (empty string, digits/punctuation only).
        return ""

    # Step 1: save the (folded) first letter.
    first_letter = first_letter_fold.get(letters[0], letters[0])

    # Steps 1-3 for the rest of the word: skip dropped letters, map consonants
    # to digits and collapse adjacent duplicates while building the list.
    digits = []
    for char in letters[1:]:
        if char in ignored:
            continue
        digit = digit_of.get(char)
        if digit is None:
            # Letter outside the table (e.g. a non-Turkish accented letter).
            continue
        if not digits or digits[-1] != digit:
            digits.append(digit)

    # Step 4: compare digit with digit. A vowel first letter has no digit
    # (``None``) and therefore never matches.
    if digits and digits[0] == digit_of.get(first_letter):
        del digits[0]

    # Step 5: exactly three digits, zero-padded on the right.
    code = "".join(str(digit) for digit in digits[:3])
    return first_letter + code.ljust(3, "0")
person GurhanCagin    schedule 09.01.2019