"""Transliteration back-ends and helpers for choosing between their outputs."""

# from transliteration_resources import (
#     azure_transliteration,
#     indic_trans,
#     indic_transliteration_OTHER_GUJARATI,
#     indic_transliteration_OTHER_GURMUKHI,
#     indic_transliteration_OTHER_ORIYA,
#     om_transliterator,
#     libindic,
#     indic_transliteration_IAST,
#     indic_transliteration_ITRANS,
#     # polyglot_trans,
#     sheetal,
#     unicode_transliteration_GURMUKHI,
#     indic_transliteration_GURMUKHI,
#     transliteration_LATIN_CYRILLIC,
#     indic_transliteration_TELUGU,
#     unicode_transliteration_GURMUKHI_LATIN,
#     indic_transliteration_GURMUKHI_LATIN,
#     transliteration_CYRILIC_LATIN,
#     ConvertToLatin,
#     readonly,
#     indic_transliteration_OTHER_DEVANAGRI,
#     indic_transliteration_DEVANAGRI_OTHER,
#     indic_transliteration_KANNADA_OTHER,
#     indic_transliteration_OTHER_KANNADA,
#     indic_transliteration_TAMIL_OTHER,
#     indic_transliteration_OTHER_TAMIL,
#     indic_transliteration_TELUGU_OTHER,
#     indic_transliteration_MALAYALAM_OTHER,
#     indic_transliteration_OTHER_GUJARATI,
#     indic_transliteration_OTHER_GURMUKHI,
#     indic_transliteration_OTHER_ORIYA,
#     translit_CHINESE_LATIN,
#     translit_th_sin_mng_heb_to_latin
# )  # , translit_THAI_LATIN
import json
import logging
import os
import re
import subprocess
import sys
import uuid
from collections import Counter

import pinyin
import requests
from anyascii import anyascii
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SCHEMES, SchemeMap, transliterate
from indicnlp.tokenize import sentence_tokenize
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from indictrans import Transliterator
from libindic.transliteration import getInstance
from om_transliterator import Transliterator as om_Transliterator
from transliterate import translit  # , get_available_language_codes

# import polyglot
# from polyglot.transliteration import Transliterator as poly
# from polyglot.text import Text

logger = logging.getLogger(__name__)

# Shared libindic transliterator instance used by libindic().
t = getInstance()

# from MNF.settings import BasePath
basePath = "/home/user/mnf/project/MNF"

# Script name -> script code sent to the Azure transliterate endpoint.
_AZURE_SCRIPT_CODES = {
    "Devanagari": "Deva",
    "Arabic": "Arab",
    "Latin": "Latn",
    "Kannada": "knda",
    "Tamil": "Taml",
    "Bengali": "Beng",
    "Telugu": "Telu",
    "Malayalam": "Mlym",
    "Cyrillic": "Cyrl",
    "Gurmukhi": "Guru",
    "Gujarati": "Gujr",
    "Oriya": "Orya",
    "Sinhala": "Sinh",
    "Hanji": "Hans",
    "Thai": "Thai",
    "Hebrew": "Hebr",
}
_AZURE_TRANSLITERATE_URL = (
    "https://api.cognitive.microsofttranslator.com/transliterate?api-version=3.0"
)
_AZURE_REGION = "eastus"
# SECURITY: a subscription key should not live in source control. The
# AZURE_TRANSLATOR_KEY environment variable takes precedence; the literal is
# kept only so existing deployments keep working. TODO: rotate and remove it.
_AZURE_DEFAULT_KEY = "959354878e73458e898a69f1f5887b69"


# -> Directly Usable azure api for transliteration
def azure_transliteration(text, source_lang, source_script, dest_script):
    """Transliterate *text* with the Azure Translator transliterate endpoint.

    Args:
        text: Text to transliterate.
        source_lang: Language code sent as the ``language`` query parameter.
        source_script: Script name (e.g. ``"Devanagari"``) or a script code;
            names not in the lookup table are sent unchanged.
        dest_script: Target script, same convention as *source_script*.

    Returns:
        The ``text`` field of the first element of the JSON response.
    """
    source_script = _AZURE_SCRIPT_CODES.get(source_script, source_script)
    dest_script = _AZURE_SCRIPT_CODES.get(dest_script, dest_script)

    headers = {
        'Ocp-Apim-Subscription-Key': os.environ.get(
            "AZURE_TRANSLATOR_KEY", _AZURE_DEFAULT_KEY
        ),
        'Ocp-Apim-Subscription-Region': _AZURE_REGION,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4()),
    }
    params = {
        'language': source_lang,
        'fromScript': source_script,
        'toScript': dest_script,
    }
    body = [{'text': text}]
    logger.debug(
        "azure transliteration request: lang=%s from=%s to=%s text=%r",
        source_lang, source_script, dest_script, text,
    )

    request = requests.post(
        _AZURE_TRANSLITERATE_URL, params=params, headers=headers, json=body
    )
    response = request.json()
    logger.debug("azure transliteration response: %s", response)
    return response[0]['text']


# Manual smoke test. Guarded so that importing this module no longer performs
# a network request and prints to stdout.
if __name__ == "__main__":
    print(azure_transliteration("mera naam dharmesh hai", "hi", "Latn", "Deva"))


def two_sources_two_outputs(sources_name, O):
    """Return the two most frequent outputs with the sources that produced them.

    Args:
        sources_name: Mapping from the stringified index of an output in *O*
            (``"0"``, ``"1"``, ...) to the name of the source that produced it.
        O: List of candidate outputs, one per source.

    Returns:
        ``((out1, sources1), (out2, sources2))`` where ``out1`` is the most
        frequent output and ``out2`` the runner-up; ties keep first-seen
        order. When all outputs are identical the second pair is ``("", "")``.

    Raises:
        IndexError: If *O* is empty.
    """
    # most_common() orders by count (descending) and keeps first-seen order
    # among equal counts.
    ranked = [output for output, _ in Counter(O).most_common()]

    positions = {}
    for index, output in enumerate(O):
        positions.setdefault(output, []).append(index)

    def producers(output):
        return [sources_name[str(index)] for index in positions[output]]

    out1 = ranked[0]
    source1 = producers(out1)
    if len(ranked) == 1:
        return (out1, source1), ("", "")
    out2 = ranked[1]
    source2 = producers(out2)
    logger.debug("top outputs: %s", ((out1, source1), (out2, source2)))
    return (out1, source1), (out2, source2)


def _first_unclaimed_source(priority_list, claimed):
    """Return the highest-priority source not in *claimed*, or None."""
    # *claimed* may be a single source name; wrap it so membership is an
    # exact match rather than a substring test.
    claimed_names = [claimed] if isinstance(claimed, str) else claimed
    return next(
        (name for name in priority_list if name not in claimed_names), None
    )


def _priority_second_choice(O, priority_list, claimed, o2, s2):
    """Pick the second output by priority, falling back to ``(o2, s2)``."""
    candidate = _first_unclaimed_source(priority_list, claimed)
    if candidate is None:
        return o2, s2
    # Outputs in O are assumed to be ordered like priority_list.
    return O[priority_list.index(candidate)], candidate


def selection_source_transliteration(sources_name, O, priority_list):
    """Choose a first and second transliteration from the candidates in *O*.

    The most frequent output wins when its count is strictly higher than every
    other count; otherwise the output of the highest-priority source is used.
    The second choice is the runner-up when it is unambiguous, else the output
    of the highest-priority source that did not produce the first choice.

    Args:
        sources_name: Mapping from stringified index in *O* to source name.
        O: Candidate outputs; indexed in the same order as *priority_list*.
        priority_list: Source names, most trusted first.

    Returns:
        ``((output1, source1), (output2, source2))``. ``source1`` / ``source2``
        are lists of source names when taken from the frequency ranking, or a
        single source name when chosen via *priority_list*.
    """
    counts = sorted(Counter(O).values(), reverse=True)
    logger.debug("output frequencies: %s", counts)
    (o1, s1), (o2, s2) = two_sources_two_outputs(sources_name, O)

    has_clear_winner = all(counts[0] > count for count in counts[1:])
    if has_clear_winner:
        output1, source1 = o1, s1
        # The runner-up is unambiguous when it is the only other output, or
        # when its count strictly beats every remaining count.
        has_clear_runner_up = len(counts) == 2 or (
            len(counts) >= 3 and all(counts[1] > count for count in counts[2:])
        )
        if has_clear_runner_up:
            return (output1, source1), (o2, s2)
    elif priority_list[0] in s1:
        output1, source1 = o1, s1
    elif priority_list[0] in s2:
        output1, source1 = o2, s2
    else:
        output1, source1 = O[0], priority_list[0]

    output2, source2 = _priority_second_choice(O, priority_list, source1, o2, s2)
    return (output1, source1), (output2, source2)


def space_after_punct(text):
    """Insert a space after ``, ! ? ( ) … -`` and collapse repeated whitespace.

    A spaced ellipsis (``". . ."``) is first normalised to ``" ... "``.
    """
    # text = text.replace('...',' ... ')
    text = text.replace(". . .", " ... ")
    text = re.sub(r"([,!?()…-])", r"\1 ", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text


# Tokens / trailing characters treated as punctuation when re-attaching
# punctuation to a transliterated sentence.
_SENTENCE_PUNCTUATION = (
    "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", " ", "-",
    ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_",
    "`", "{", "|", "}", "~", "…", "...", "।",
)


def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of *original* to the words of *transliterated*.

    Args:
        original: Source sentence, including punctuation.
        transliterated: Whitespace-separated transliteration of the words of
            *original*, without punctuation.

    Returns:
        The transliterated words in order, each followed by the trailing
        punctuation character of the corresponding original token; standalone
        punctuation tokens are copied through.

    Raises:
        IndexError: If *transliterated* has fewer words than *original* has
            non-punctuation tokens.
    """
    original_tokens = space_after_punct(original).split()
    transliterated_tokens = transliterated.split()

    sentence = []
    next_word = 0
    for token in original_tokens:
        if token in _SENTENCE_PUNCTUATION:
            sentence.append(token)
            continue
        word = transliterated_tokens[next_word]
        next_word += 1
        if token[-1] in _SENTENCE_PUNCTUATION:
            word += token[-1]
        sentence.append(word)

    transliterated_sentence = " ".join(sentence)
    # str.replace returns a new string; the result must be assigned for the
    # spacing added by space_after_punct to actually be undone.
    transliterated_sentence = transliterated_sentence.replace(" ... ", "...")
    transliterated_sentence = transliterated_sentence.replace("… ", "…")
    return transliterated_sentence


def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Return the preferred transliteration among *outputs*.

    Delegates to selection_source_transliteration and returns only the first
    choice's output; *word* is currently unused.
    """
    # print(outputs)
    # doc2 = docx.Document()
    # sections = doc2.sections
    # for section in sections:
    #     section.top_margin = Inches(0.2)
    #     section.bottom_margin = Inches(0.2)
    #     section.left_margin = Inches(0.2)
    #     section.right_margin = Inches(0.2)
    # section = doc2.sections[-1]
    # new_height = section.page_width
    # section.page_width = section.page_height
    # section.page_height = new_height
    # name = 'Final table ' + doc_file
    # doc2.add_heading(name, 0)
    # doc_para = doc2.add_paragraph()
    # doc_para.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True
    # table2 = doc2.add_table(rows=1, cols=4)
    # table2.style = 'TableGrid'
    # hdr_Cells = table2.rows[0].cells
    # hdr_Cells[0].paragraphs[0].add_run("Input").bold = True
    # hdr_Cells[1].paragraphs[0].add_run("Output1").bold = True
    # hdr_Cells[2].paragraphs[0].add_run("Output2").bold = True
    # hdr_Cells[3].paragraphs[0].add_run("Output3").bold = True
    O1ANDS1, O2ANDS2 = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    logger.debug("%s compare all transliterations", O1ANDS1)
    # add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name)
    return O1ANDS1[0]


# -> Directly Usable Polyglot api for transliteration
# def polyglot_trans(text, source_script, dest_script):
#     # from polyglot.downloader import downloader
#     if source_script=="Latin":
#         source_script="en"
#     elif source_script=="Arabic":
#         source_script="ar"
#     elif source_script=="Hanji":
#         source_script="zh"
#     if dest_script=="Latin":
#         dest_script="en"
#     elif dest_script=="Arabic":
#         dest_script="ar"
#     elif source_script=="Hanji":
#         source_script="zh"
#     new_text = ""
#     text_break = Text(text)
#     for x in text_break.transliterate(dest_script):
#         new_text = new_text + str(x)
#     return new_text


# Script name -> three-letter language code passed to indictrans.
_INDICTRANS_CODES = {
    "Devanagari": "hin",
    "Arabic": "urd",
    "Kannada": "kan",
    "Tamil": "tam",
    "Latin": "eng",
    "Bengali": "ben",
    "Telugu": "tel",
    "Malayalam": "mal",
    "Oriya": "ori",
    "Gujarati": "guj",
    "Gurmukhi": "pan",
}


# -> Directly Usable indic_trans api for transliteration
def indic_trans(text, source_script, dest_script):
    """Transliterate *text* with the ``indictrans`` Transliterator.

    Script names are translated to indictrans codes through one shared table,
    so source and destination always agree (a Bengali destination maps to
    ``"ben"``). Names not in the table are passed through unchanged.
    """
    source_code = _INDICTRANS_CODES.get(source_script, source_script)
    dest_code = _INDICTRANS_CODES.get(dest_script, dest_script)
    trn = Transliterator(source=source_code, target=dest_code, build_lookup=True)
    return trn.transform(text)


# -> Directly Usable om_translator api for transliteration
def om_transliterator(text):
    """Transliterate *text* using om_transliterator's ``knda_to_latn``."""
    return om_Transliterator().knda_to_latn(text)


# Script name -> two-letter prefix of the libindic locale code.
_LIBINDIC_CODES = {
    "Devanagari": "hi",
    "Latin": "en",
    "Malayalam": "ml",
    "Gujarati": "gu",
    "Oriya": "or",
    "Telugu": "te",
    "Bengali": "bn",
    "Tamil": "ta",
    "Kannada": "kn",
    # Punjabi; "gu" here would select Gujarati output for Gurmukhi requests.
    "Gurmukhi": "pa",
}


# -> Directly Usable libindic api for transliteration
def libindic(text, dest_script):
    """Transliterate *text* to *dest_script* with the shared libindic instance.

    The target locale is ``<code>_IN``; a *dest_script* not in the lookup
    table is used as the code unchanged.
    """
    code = _LIBINDIC_CODES.get(dest_script, dest_script) + '_IN'
    return t.transliterate(text, code)


# -> Directly Usable indic_transliteration_IAST api for transliteration
def indic_transliteration_IAST(text):
    """Convert IAST-romanised *text* to Devanagari with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)


# -> Directly Usable indic_transliteration_ITRANS api for transliteration
def indic_transliteration_ITRANS(text):
    """Convert ITRANS-romanised *text* to Devanagari with sanscript."""
    return transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)


def _run_romanizer_script(script_name, text):
    """Run a script from ``conversion/translation`` on *text*; return stdout."""
    script_path = rf"{basePath}/conversion/translation/{script_name}"
    raw_output = subprocess.check_output([sys.executable, script_path, text])
    return raw_output.decode('utf-8')


# -> Directly Usable sheetal api for transliteration
def sheetal(text):
    """Return the decoded stdout of ``dev-rom-sheetal.py`` run on *text*."""
    return _run_romanizer_script("dev-rom-sheetal.py", text)


# -> Directly Usable ritwik code for transliteration
def ritwik(text):
    """Return the decoded stdout of ``dev-rom-ritwik.py`` run on *text*."""
    return _run_romanizer_script("dev-rom-ritwik.py", text)


# -> Directly Usable indic_transliteration_GURMUKHI api for transliteration
def indic_transliteration_GURMUKHI(text):
    """Convert IAST-romanised *text* to Gurmukhi with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.GURMUKHI)


# -> Directly Usable unicode_transliteration_GURMUKHI api for transliteration
def unicode_transliteration_GURMUKHI(text):
    """Convert IAST *text* to Devanagari, then map ``hi`` -> ``pa`` via indicnlp."""
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")


# -> Directly Usable transliteration_LATIN_CYRILLIC api for transliteration
def transliteration_LATIN_CYRILLIC(text):
    """Transliterate *text* with ``translit`` using the ``'bg'`` language pack."""
    return translit(text, 'bg')


# -> Directly Usable translit_CHINESE_LATIN api for transliteration
def translit_CHINESE_LATIN(text):
    """Return space-delimited pinyin for *text* using the ``strip`` format."""
    return pinyin.get(text, format="strip", delimiter=" ")


def translit_th_sin_mng_heb_to_latin(text):
    """Return the ``anyascii`` rendering of *text*."""
    return anyascii(text)


# -> Directly Usable indic_transliteration_TELUGU api for transliteration
def indic_transliteration_TELUGU(text):
    """Convert IAST-romanised *text* to Telugu with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.TELUGU)


# -> Directly Usable indic_transliteration_GURMUKHI_LATIN api for transliteration
def indic_transliteration_GURMUKHI_LATIN(text):
    """Convert Gurmukhi *text* to ITRANS romanisation with sanscript."""
    return transliterate(text, sanscript.GURMUKHI, sanscript.ITRANS)


# -> Directly Usable unicode_transliteration_GURMUKHI_LATIN api for transliteration
def unicode_transliteration_GURMUKHI_LATIN(text):
    """Convert IAST *text* to Devanagari, then map ``hi`` -> ``pa`` via indicnlp.

    NOTE(review): despite the name this performs the same IAST -> Gurmukhi
    conversion as unicode_transliteration_GURMUKHI — confirm the intended
    direction before relying on it.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")


# -> Directly Usable transliteration_CYRILIC_LATIN api for transliteration
def transliteration_CYRILIC_LATIN(text):
    """Reverse-transliterate *text* with ``translit`` and the ``'bg'`` pack."""
    return translit(text, 'bg', reversed=True)


# Cyrillic letter -> Latin replacement used by readonly().
_CYRILLIC_TO_LATIN = str.maketrans({
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e", "ё": "yo",
    "ж": "zh", "з": "z", "и": "i", "й": "j", "к": "k", "л": "l", "м": "m",
    "н": "n", "о": "o", "п": "p", "р": "r", "с": "s", "т": "t", "у": "u",
    "ф": "f", "х": "h", "ц": "c", "ч": "ch", "ш": "sh", "щ": "sch",
    "ъ": "j", "ы": "i", "ь": "j", "э": "e", "ю": "yu", "я": "ya",
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E", "Ё": "Yo",
    "Ж": "Zh", "З": "Z", "И": "I", "Й": "J", "К": "K", "Л": "L", "М": "M",
    "Н": "N", "О": "O", "П": "P", "Р": "R", "С": "S", "Т": "T", "У": "U",
    "Ф": "F", "Х": "H", "Ц": "C", "Ч": "Ch", "Ш": "Sh", "Щ": "Sch",
    "Ъ": "J", "Ы": "I", "Ь": "J", "Э": "E", "Ю": "Yu", "Я": "Ya",
})


# -> Some Random Code to replace special characters
def readonly(str):
    """Replace every Cyrillic letter in *str* with its Latin equivalent.

    The parameter name shadows the builtin; it is kept for compatibility with
    existing callers.
    """
    # One pass over the string instead of one .replace() per letter; the
    # replacements contain no Cyrillic, so the result is the same.
    return str.translate(_CYRILLIC_TO_LATIN)


# -> Code to Convert Letters to Latin Script
def ConvertToLatin(source):
    """Apply readonly() to each item of *source* and concatenate the results."""
    return "".join(readonly(letter) for letter in source)


# Script name -> sanscript scheme constant.
_SANSCRIPT_SCHEMES = {
    "Devanagari": sanscript.DEVANAGARI,
    "Malayalam": sanscript.MALAYALAM,
    "Gujarati": sanscript.GUJARATI,
    "Telugu": sanscript.TELUGU,
    "Oriya": sanscript.ORIYA,
    "Bengali": sanscript.BENGALI,
    "Kannada": sanscript.KANNADA,
    "Gurmukhi": sanscript.GURMUKHI,
    "Tamil": sanscript.TAMIL,
}
_DEVANAGARI_PARTNERS = (
    "Malayalam", "Gujarati", "Telugu", "Oriya",
    "Bengali", "Kannada", "Gurmukhi", "Tamil",
)
_KANNADA_PARTNERS = ("Malayalam", "Telugu", "Tamil", "Bengali")
_TAMIL_PARTNERS = ("Malayalam", "Telugu")


def _checked_scheme(script, supported):
    """Return the sanscript scheme for *script*.

    Raises:
        ValueError: If *script* is not one of *supported*. (Previously an
            unsupported script surfaced as an UnboundLocalError.)
    """
    if script not in supported:
        raise ValueError(
            f"Unsupported script {script!r}; expected one of {list(supported)}"
        )
    return _SANSCRIPT_SCHEMES[script]


# -> Directly Usable indic_transliteration_OTHER_DEVANAGRI api for transliteration
def indic_transliteration_OTHER_DEVANAGRI(text, src_script):
    """Convert *text* from *src_script* to Devanagari; ValueError if unsupported."""
    source = _checked_scheme(src_script, _DEVANAGARI_PARTNERS)
    return transliterate(text, source, sanscript.DEVANAGARI)


# -> Directly Usable indic_transliteration_DEVANAGRI_OTHER api for transliteration
def indic_transliteration_DEVANAGRI_OTHER(text, dest_script):
    """Convert Devanagari *text* to *dest_script*; ValueError if unsupported."""
    target = _checked_scheme(dest_script, _DEVANAGARI_PARTNERS)
    return transliterate(text, sanscript.DEVANAGARI, target)


# -> Directly Usable indic_transliteration_KANNADA_OTHER api for transliteration
def indic_transliteration_KANNADA_OTHER(text, dest_script):
    """Convert Kannada *text* to *dest_script*; ValueError if unsupported."""
    target = _checked_scheme(dest_script, _KANNADA_PARTNERS)
    return transliterate(text, sanscript.KANNADA, target)


# -> Directly Usable indic_transliteration_OTHER_KANNADA api for transliteration
def indic_transliteration_OTHER_KANNADA(text, src_script):
    """Convert *text* from *src_script* to Kannada; ValueError if unsupported."""
    source = _checked_scheme(src_script, _KANNADA_PARTNERS)
    return transliterate(text, source, sanscript.KANNADA)


# -> Directly Usable indic_transliteration_TAMIL_OTHER api for transliteration
def indic_transliteration_TAMIL_OTHER(text, dest_script):
    """Convert Tamil *text* to *dest_script*; ValueError if unsupported."""
    target = _checked_scheme(dest_script, _TAMIL_PARTNERS)
    return transliterate(text, sanscript.TAMIL, target)


# -> Directly Usable indic_transliteration_OTHER_TAMIL api for transliteration
def indic_transliteration_OTHER_TAMIL(text, src_script):
    """Convert *text* from *src_script* to Tamil; ValueError if unsupported."""
    source = _checked_scheme(src_script, _TAMIL_PARTNERS)
    return transliterate(text, source, sanscript.TAMIL)


# -> Directly Usable indic_transliteration_TELUGU_OTHER api for transliteration
def indic_transliteration_TELUGU_OTHER(text, desc_script):
    """Convert Telugu *text* to *desc_script* (Malayalam only)."""
    target = _checked_scheme(desc_script, ("Malayalam",))
    return transliterate(text, sanscript.TELUGU, target)


# -> Directly Usable indic_transliteration_MALAYALAM_OTHER api for transliteration
def indic_transliteration_MALAYALAM_OTHER(text, desc_script):
    """Convert Malayalam *text* to *desc_script* (Telugu only)."""
    target = _checked_scheme(desc_script, ("Telugu",))
    return transliterate(text, sanscript.MALAYALAM, target)


# -> Directly Usable indic_transliteration_OTHER_GUJARATI api for transliteration
def indic_transliteration_OTHER_GUJARATI(text, src_script):
    """Convert *text* from Gurmukhi or Oriya to Gujarati."""
    source = _checked_scheme(src_script, ("Gurmukhi", "Oriya"))
    return transliterate(text, source, sanscript.GUJARATI)


# -> Directly Usable indic_transliteration_OTHER_GURMUKHI api for transliteration
def indic_transliteration_OTHER_GURMUKHI(text, src_script):
    """Convert *text* from Gujarati or Oriya to Gurmukhi."""
    source = _checked_scheme(src_script, ("Gujarati", "Oriya"))
    return transliterate(text, source, sanscript.GURMUKHI)


# -> Directly Usable indic_transliteration_OTHER_ORIYA api for transliteration
def indic_transliteration_OTHER_ORIYA(text, src_script):
    """Convert *text* from Gujarati or Gurmukhi to Oriya."""
    source = _checked_scheme(src_script, ("Gujarati", "Gurmukhi"))
    return transliterate(text, source, sanscript.ORIYA)


# Every punctuation character maps to a single space.
_PUNCT_TO_SPACE = str.maketrans(
    {char: " " for char in "!()-[]{};:'\"\\,<>./?@#$%^&*_~…।"}
)


def punct_remover(string):
    """Return *string* with each punctuation character replaced by a space."""
    return string.translate(_PUNCT_TO_SPACE)


source_lang = "hi"
text = "सड़क के बीच में एक बड़ा ट्रक क्यों है?"
source_script = "Devanagari"
dest_script = "Latin"

# from fuzzywuzzy import fuzz
import logging
from collections import Counter
from difflib import SequenceMatcher
from typing import Callable

import Levenshtein
from rapidfuzz import fuzz

logger = logging.getLogger(__name__)


def calculate_edit_distance(original_word: str, transliterated_word: str) -> int:
    """Return the Levenshtein distance between the two words (lower is closer)."""
    return Levenshtein.distance(original_word, transliterated_word)


def calculate_similarity(original_word: str, transliterated_word: str) -> float:
    """Return ``1 - distance / longest_length`` for the two words.

    Two empty strings are identical, so they score ``1.0`` instead of raising
    ZeroDivisionError.
    """
    longest = max(len(original_word), len(transliterated_word))
    if longest == 0:
        return 1.0
    return 1 - Levenshtein.distance(original_word, transliterated_word) / longest
    # return matcher.ratio()


def calculate_fuzz_similarity(original_word: str, transliterated_word: str) -> float:
    """Return rapidfuzz's ``fuzz.ratio`` for the two words (higher is closer)."""
    return fuzz.ratio(original_word, transliterated_word)


def get_best_output(
    inside_func: Callable[[str, str], float],
    original_word: str,
    transliteration_outputs: list,
    reverse: bool = False,
):
    """Return the candidate that scores best against *original_word*.

    Args:
        inside_func: Scoring function called as ``inside_func(original_word,
            candidate)``.
        original_word: Reference word; also the result when no candidate
            improves on the starting score.
        transliteration_outputs: Candidates to score.
        reverse: When False the highest score strictly above 0 wins; when True
            the lowest score wins (for distance metrics).

    Returns:
        The winning candidate; on ties the earliest candidate is kept.
    """
    best_transliteration = original_word
    best_score = float('inf') if reverse else 0
    for candidate in transliteration_outputs:
        score = inside_func(original_word, candidate)
        logger.debug(
            "score=%s best_so_far=%s original=%r candidate=%r",
            score, best_score, original_word, candidate,
        )
        improved = score < best_score if reverse else score > best_score
        if improved:
            best_score = score
            best_transliteration = candidate
    logger.debug("best candidate: %r", best_transliteration)
    return best_transliteration


def compare_transliteration_outputs(original_word: str, transliterated_words: list) -> str:
    """Pick the candidate preferred by most of the three comparison metrics.

    Each metric (edit distance, normalised similarity, fuzz ratio) nominates
    its best candidate; the most frequently nominated one is returned, with
    ties resolved in that metric order.

    Returns:
        The chosen candidate, or *original_word* unchanged when either
        argument is None.
    """
    if original_word is None or transliterated_words is None:
        return original_word

    best_output1 = get_best_output(
        calculate_edit_distance, original_word, transliterated_words, True
    )
    best_output2 = get_best_output(
        calculate_similarity, original_word, transliterated_words
    )
    best_output3 = get_best_output(
        calculate_fuzz_similarity, original_word, transliterated_words
    )
    logger.debug("metric winners: %r %r %r", best_output1, best_output2, best_output3)
    nominations = Counter([best_output1, best_output2, best_output3])
    return nominations.most_common(1)[0][0]


# NOTE: everything below is legacy, commented-out driver code kept for
# reference. Its indentation was reconstructed and may not match the original.

# sources_name = {"0": "indic_trans", "1": "Azure","2": "libindic", "3": "sheetal"}
# priority_list = ["indic_trans", "Azure", "libindic", "sheetal"]
# etc_punctuation = ["", " . . .", " . .", " . . ”"]
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
#     source_lang = "hi"
# transliterated_text = []
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
#     source_lang = "hi"
# transliterated_text = []
# Out = []
# print("sentences", sentences)
# for sentence in sentences[0].split():
#     print("full word -> ", sentence)
#     if sentence in etc_punctuation:
#         continue
#     temp_sentence = punct_remover(sentence)
#     t0 = indic_trans(temp_sentence, source_script, dest_script)
#     t1 = azure_transliteration(
#         temp_sentence, source_lang, source_script, dest_script
#     )
#     t2 = libindic(temp_sentence, dest_script).rstrip()
#     t3 = sheetal(temp_sentence).replace("\n", "")
#     Out = []
#     for i in range(len(temp_sentence.split())):
#         word = temp_sentence.split()[i]
#         T0 = t0.split()[i]
#         T1 = t1.split()[i]
#         T2 = t2.split()[i]
#         T3 = t3.split()[i]
#         outputs = [T0, T1, T2, T3]
#         out = compare_outputs_transliteration(
#             word, outputs, sources_name, priority_list
#         )
#         Out.append(out)
#     trans_sent_wo_punct = " ".join(Out)
#     out = compare_transliteration_outputs(temp_sentence, [t0, t1, t2, t3])
#     # print("this words output is -> ", out)
#     # out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
#     Out.append(out)
#     trans_sent_wo_punct = " ".join(Out)
#     print("trans_sent_wo_punct", trans_sent_wo_punct)
#     transliterated_sentence = final_transliterated_sentence(
#         sentence, trans_sent_wo_punct
#     )
#     print("trans_sent_w_punct", transliterated_sentence)
#     transliterated_text.append(transliterated_sentence)
#     print(transliterated_sentence)
# print("Entered Exiting Here1212", Out)
# f = open("output.txt", "w")
# f.write(" ".join(Out))
# f.close()
# return " ".join(transliterated_text)

# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# t1 = azure_transliteration(
#     temp_sentence, source_lang, source_script, dest_script
# )
# t2 = libindic(temp_sentence, dest_script).rstrip()
# t3 = sheetal(temp_sentence).replace("\n", "")
# Out = []
# print(t0, t1, t2, t3)
# outputs_len = [len(T.split(" ")) for T in [t0, t1, t2, t3]]
# print(outputs_len, "outputs len")
# for i in range(len(temp_sentence.split())):
#     word = temp_sentence.split()[i]
#     T0 = t0.split()[i]
#     T1 = t1.split()[i]
#     T2 = t2.split()[i]
#     T3 = t3.split()[i]
#     outputs = [T0, T1, T2, T3]
#     #
#     out = compare_outputs_transliteration(
#         word, outputs, sources_name, priority_list
#     )
#     Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# out = compare_outputs_transliteration(temp_sentence, [t0,t1,t2,t3], sources_name, priority_list)
# # print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(temp_sentence, out)
# print("trans_sent_", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# print(" ".join(transliterated_text))

# if text in etc_punctuation:
#     return text
# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# tt = 0
# try:
#     t0 = indic_trans(temp_sentence, source_script, dest_script)
#     outputa = t0
# except:
#     tt += 1
# try:
#     if tt == 1:
#         t1 = azure_transliteration(
#             temp_sentence, source_lang, source_script, dest_script
#         )
#         outputa = t1
# except:
#     tt += 1
# # print("before t1111111111")
# try:
#     if tt == 2:
#         t2 = libindic(temp_sentence, dest_script).rstrip()
#         outputa = t2
# except:
#     tt += 1
# # print("before sheetal", t2)
# try:
#     if tt == 3:
#         t3 = sheetal(temp_sentence).replace("\n", "")
#         outputa = t3
# except:
#     tt += 1
# #
# if tt == 4:
#     outputa = text
# else:
#     trans_sent_wo_punct = outputa
#     print("trans_sent_wo_punct", trans_sent_wo_punct)
#     transliterated_sentence = final_transliterated_sentence(
#         sentence, trans_sent_wo_punct
#     )
#     print("trans_sent_w_punct", transliterated_sentence)
#     transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# return outputa

# source_lang = "hi"
# source_script = "Latin"
# dest_script = "Devanagari"
# sources_name = {
#     "0": "Azure",
#     "1": "indic_trans",
#     "2": "google",
#     "3": "indic_trans_IAST",
# }
# sentences = sentence_tokenize.sentence_split(text, lang="en")
# priority_list = [
#     "Azure",
#     "indic_trans",
#     "google",
#     "indic_trans_IAST",
# ]
# transliterated_text = []
# for sentence in sentences:
#     if (
#         sentence == ""
#         or sentence == " . . ."
#         or sentence == " . ."
#         or sentence == " . . ”"
#     ):
#         continue
#     OUT = []
#     for word in sentence.split():
#         if word == ".":
#             continue
#         t0 = azure_transliteration(
#             word, source_lang, source_script, dest_script)
#         t1 = indic_trans(word, source_script, dest_script)
#         t2 = google(word, "en", "hi")
#         t3 = indic_transliteration_IAST(word)
#         outputs = [t0, t1, t2, t3]
#         out = compare_outputs_transliteration(
#             word, outputs, sources_name, priority_list
#         )
#         OUT.append(out)
#     transliterated_text.append(" ".join(OUT))
# print("running perfectly")
# return " ".join(transliterated_text)
# print(indic_transliteration_IAST("mera naam dharmesh hai"))