Conversion_Kitchen_Code/kitchen_counter/conversion/translation/transliteration_testing.py

1058 lines
36 KiB
Python
Raw Permalink Normal View History

2024-04-27 09:33:09 +00:00
# from transliteration_resources import (
# azure_transliteration,
# indic_trans,
# indic_transliteration_OTHER_GUJARATI,
# indic_transliteration_OTHER_GURMUKHI,
# indic_transliteration_OTHER_ORIYA,
# om_transliterator,
# libindic,
# indic_transliteration_IAST,
# indic_transliteration_ITRANS,
# # polyglot_trans,
# sheetal,
# unicode_transliteration_GURMUKHI,
# indic_transliteration_GURMUKHI,
# transliteration_LATIN_CYRILLIC,
# indic_transliteration_TELUGU,
# unicode_transliteration_GURMUKHI_LATIN,
# indic_transliteration_GURMUKHI_LATIN,
# transliteration_CYRILIC_LATIN,
# ConvertToLatin,
# readonly,
# indic_transliteration_OTHER_DEVANAGRI,
# indic_transliteration_DEVANAGRI_OTHER,
# indic_transliteration_KANNADA_OTHER,
# indic_transliteration_OTHER_KANNADA,
# indic_transliteration_TAMIL_OTHER,
# indic_transliteration_OTHER_TAMIL,
# indic_transliteration_TELUGU_OTHER,
# indic_transliteration_MALAYALAM_OTHER,
# indic_transliteration_OTHER_GUJARATI,
# indic_transliteration_OTHER_GURMUKHI,
# indic_transliteration_OTHER_ORIYA,
# translit_CHINESE_LATIN,
# translit_th_sin_mng_heb_to_latin
# ) # , translit_THAI_LATIN
import subprocess
import sys
import os
import requests, uuid, json
from indictrans import Transliterator
from om_transliterator import Transliterator as om_Transliterator
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from libindic.transliteration import getInstance
t = getInstance()
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from transliterate import translit # , get_available_language_codes
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
# import polyglot
# from polyglot.transliteration import Transliterator as poly
# from polyglot.text import Text
import pinyin
from anyascii import anyascii
# from MNF.settings import BasePath
basePath = "/home/user/mnf/project/MNF"
# -> Directly Usable azure api for transliteration
# Full script names accepted by azure_transliteration, mapped to the script
# codes sent to the Azure transliterate endpoint.
# NOTE(review): "knda" is lowercase here while every other code is
# capitalised; kept as in the original - confirm against the Azure docs.
_AZURE_SCRIPT_CODES = {
    "Devanagari": "Deva",
    "Arabic": "Arab",
    "Latin": "Latn",
    "Kannada": "knda",
    "Tamil": "Taml",
    "Bengali": "Beng",
    "Telugu": "Telu",
    "Malayalam": "Mlym",
    "Cyrillic": "Cyrl",
    "Gurmukhi": "Guru",
    "Gujarati": "Gujr",
    "Oriya": "Orya",
    "Sinhala": "Sinh",
    "Hanji": "Hans",
    "Thai": "Thai",
    "Hebrew": "Hebr",
}


def azure_transliteration(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` between scripts via the Azure Translator REST API.

    Args:
        text: Text to transliterate.
        source_lang: Language code sent as the ``language`` query parameter.
        source_script: Script of ``text``. Full names listed in
            ``_AZURE_SCRIPT_CODES`` are converted to script codes; any other
            value is forwarded unchanged.
        dest_script: Target script, same conventions as ``source_script``.

    Returns:
        The ``text`` field of the first element of the decoded JSON response.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError, IndexError, TypeError: If the decoded response is not a
            non-empty list of objects carrying a ``text`` key.
    """
    source_script = _AZURE_SCRIPT_CODES.get(source_script, source_script)
    dest_script = _AZURE_SCRIPT_CODES.get(dest_script, dest_script)

    # SECURITY: the credential should not live in source control. It can now be
    # supplied through the AZURE_TRANSLATOR_KEY environment variable; the
    # literal remains only as a backward-compatible fallback and should be
    # rotated and removed.
    subscription_key = os.environ.get(
        "AZURE_TRANSLATOR_KEY", "959354878e73458e898a69f1f5887b69"
    )
    location = "eastus"
    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }
    constructed_url1 = "https://api.cognitive.microsofttranslator.com/transliterate?api-version=3.0"
    print("source_script", source_script)
    print("dest_script", dest_script)
    print("source_lang", source_lang)
    print("text", text)
    params = {'language': source_lang, 'fromScript': source_script, 'toScript': dest_script}
    body = [{'text': text}]
    # A timeout keeps a stalled connection from blocking the caller forever.
    request = requests.post(
        constructed_url1, params=params, headers=headers, json=body, timeout=30
    )
    response = request.json()
    print(response)
    return response[0]['text']
# NOTE(review): this statement runs whenever the module is imported or executed
# and calls azure_transliteration, which issues a live HTTP request to Azure.
print(azure_transliteration("mera naam dharmesh hai", "hi", "Latn", "Deva"))
from collections import Counter
def two_sources_two_outputs(sources_name, O):
    """Return the two most frequent outputs together with the sources that produced them.

    Args:
        sources_name: Mapping from the stringified position in ``O``
            (``"0"``, ``"1"``, ...) to a source name.
        O: List of outputs, one per source.

    Returns:
        ``((out1, sources1), (out2, sources2))`` where ``out1`` is the most
        frequent output and ``out2`` the next one; ties keep first-seen order.
        When ``O`` holds a single distinct value the second pair is
        ``("", "")``.
    """
    print("sources name is", sources_name, O)
    tally = Counter(O)
    print("dict1", tally)
    counts_desc = sorted(tally.values(), reverse=True)
    print("sorted_value", counts_desc)

    # Stable sort: equal counts stay in order of first appearance in O.
    ranked = dict(sorted(tally.items(), key=lambda pair: pair[1], reverse=True))
    print("sorted_Dict", ranked)
    candidates = list(ranked)
    print(candidates)

    # Positions in O at which each distinct output occurs.
    positions = {}
    for index, value in enumerate(O):
        positions.setdefault(value, []).append(index)
    print("rm", list(positions.values()))
    print("fs", list(positions))
    print("here it is", positions)

    first_output = candidates[0]
    first_sources = [sources_name[str(index)] for index in positions[first_output]]
    print(first_sources)
    if len(candidates) == 1:
        return (first_output, first_sources), ("", "")

    second_output = candidates[1]
    second_sources = [sources_name[str(index)] for index in positions[second_output]]
    print((first_output, first_sources), (second_output, second_sources))
    return (first_output, first_sources), (second_output, second_sources)
def _first_unused_priority_choice(O, priority_list, used_sources, fallback):
    """Return ``(output, source)`` for the first priority source not in ``used_sources``.

    ``fallback`` is returned when every name in ``priority_list`` is already used.
    """
    for position, name in enumerate(priority_list):
        if name not in used_sources:
            # priority_list is indexed in parallel with O.
            return O[position], name
    return fallback


def selection_source_transliteration(sources_name, O, priority_list):
    """Choose a primary and a secondary transliteration from competing outputs.

    Outputs are ranked by how many sources agree on them; ``priority_list``
    breaks ties.

    Args:
        sources_name: Mapping from stringified index in ``O`` to source name.
        O: Non-empty list of outputs, one per source, in the same order as
            ``priority_list``.
        priority_list: Source names ordered from most to least preferred.

    Returns:
        ``((output1, source1), (output2, source2))``. A ``source`` value is a
        list of source names when the output was chosen by frequency, or a
        single source name when it was chosen by priority.
    """
    counts = list(Counter(O).values())
    print(counts)
    counts.sort(reverse=True)
    print(counts)

    # Positions of the lower-ranked counts that the top count strictly beats.
    check = [i for i in range(len(counts) - 1) if counts[0] > counts[i + 1]]
    print("check here is -> ", check)
    top_is_unique = len(check) == len(counts) - 1
    print("check", check)

    if top_is_unique:
        print("here1")
        (o1, s1), (o2, s2) = two_sources_two_outputs(sources_name, O)
        print((o1, s1), (o2, s2))
        output1, source1 = o1, s1
        print(counts)
        if len(counts) == 2:
            output2, source2 = o2, s2
        else:
            # Fix: the original appended to `check` instead of `check1`, so the
            # runner-up was never recognised as a clear second place.
            runner_up_is_unique = len(counts) > 2 and all(
                counts[1] > lower for lower in counts[2:]
            )
            if runner_up_is_unique:
                output2, source2 = o2, s2
            else:
                output2, source2 = _first_unused_priority_choice(
                    O, priority_list, source1, (o2, s2)
                )
    else:
        # No single most-frequent output: prefer whichever candidate the
        # highest-priority source produced.
        (o1, s1), (o2, s2) = two_sources_two_outputs(sources_name, O)
        if priority_list[0] in s1:
            output1, source1 = o1, s1
        elif priority_list[0] in s2:
            output1, source1 = o2, s2
        else:
            output1 = O[0]
            source1 = priority_list[0]
        # source1 may be a bare name here; wrap it so membership is an exact
        # name match rather than a substring test.
        used_sources = source1 if isinstance(source1, list) else [source1]
        output2, source2 = _first_unused_priority_choice(
            O, priority_list, used_sources, (o2, s2)
        )
    return (output1, source1), (output2, source2)
def space_after_punct(text):
    """Normalise spacing around punctuation in ``text``.

    Spaced ellipses (``. . .``) become a standalone ``...`` token, a space is
    inserted after each of ``, ! ? ( ) … -``, and runs of whitespace are
    collapsed to a single space.
    """
    import re

    text = text.replace(". . .", " ... ")
    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"([,!?()…-])", r"\1 ", text)
    return re.sub(r"\s{2,}", " ", text)
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to the words of ``transliterated``.

    ``original`` is first normalised with ``space_after_punct``. Tokens that
    are pure punctuation are copied through; every other token consumes the
    next transliterated word and, if it ends in punctuation, that trailing
    character is appended to the word.

    Args:
        original: Source sentence, including punctuation.
        transliterated: Transliteration of the sentence without punctuation,
            words separated by whitespace.

    Returns:
        The transliterated sentence with punctuation restored.

    Raises:
        IndexError: If ``transliterated`` has fewer words than ``original``
            has non-punctuation tokens.
    """
    import string

    # ASCII punctuation plus the standalone ellipsis token. The original list
    # also held " " and "" entries, which can never match a split() token or
    # its last character, so they are omitted.
    punct = set(string.punctuation) | {"..."}

    original_tokens = space_after_punct(original).split()
    transliterated_tokens = transliterated.split()

    rebuilt = []
    next_word = 0
    for token in original_tokens:
        if token in punct:
            rebuilt.append(token)
            continue
        word = transliterated_tokens[next_word]
        next_word += 1
        if token[-1] in punct:
            word += token[-1]
        rebuilt.append(word)

    sentence = " ".join(rebuilt)
    # Fix: str.replace returns a new string; the original discarded the result.
    # The original's second call, replace("", ""), was a no-op and is dropped.
    return sentence.replace(" ... ", "...")
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Return the first-choice transliteration among ``outputs``.

    Ranking is delegated to ``selection_source_transliteration``; only the
    text of the primary ``(output, source)`` pair is returned. ``word`` is
    accepted but not used by the current implementation.
    """
    # NOTE: a disabled python-docx comparison-table report
    # (add_dial_comparison_doc2_transliteration) used to be built here.
    primary, _secondary = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    print(primary, "compare all transliterations")
    return primary[0]
# -> Directly Usable Polyglot api for transliteration
# def polyglot_trans(text, source_script, dest_script):
# # from polyglot.downloader import downloader
# if source_script=="Latin":
# source_script="en"
# elif source_script=="Arabic":
# source_script="ar"
# elif source_script=="Hanji":
# source_script="zh"
# if dest_script=="Latin":
# dest_script="en"
# elif dest_script=="Arabic":
# dest_script="ar"
# elif source_script=="Hanji":
# source_script="zh"
# new_text = ""
# text_break = Text(text)
# for x in text_break.transliterate(dest_script):
# new_text = new_text + str(x)
# return new_text
# -> Directly Usable indic_trans api for transliteration
def indic_trans(text, source_script, dest_script):
    """Transliterate ``text`` with the ``indictrans`` ``Transliterator``.

    Args:
        text: Text to transliterate.
        source_script: Script name of ``text`` (e.g. ``"Devanagari"``). Names
            without a mapping below are passed to ``Transliterator`` unchanged.
        dest_script: Target script name, same conventions.

    Returns:
        The result of ``Transliterator.transform(text)``.
    """
    # One table for both directions. The original duplicated two elif chains
    # whose target side tested `source_script` by mistake, contained
    # unreachable branches and produced the mis-cased code "Ben" for Bengali.
    script_to_code = {
        "Devanagari": "hin",
        "Arabic": "urd",
        "Kannada": "kan",
        "Tamil": "tam",
        "Latin": "eng",
        "Bengali": "ben",
        "Telugu": "tel",
        "Malayalam": "mal",
        "Oriya": "ori",
        "Gujarati": "guj",
        "Gurmukhi": "pan",
    }
    source_code = script_to_code.get(source_script, source_script)
    target_code = script_to_code.get(dest_script, dest_script)
    trn = Transliterator(source=source_code, target=target_code, build_lookup=True)
    return trn.transform(text)
# -> Directly Usable om_translator api for transliteration
def om_transliterator(text):
    """Run ``text`` through ``om_Transliterator.knda_to_latn`` and return the result."""
    return om_Transliterator().knda_to_latn(text)
# -> Directly Usable libindic api for transliteration
def libindic(text, dest_script):
    """Transliterate ``text`` into ``dest_script`` with the module-level libindic instance ``t``.

    Args:
        text: Text to transliterate.
        dest_script: Target script name (e.g. ``"Latin"``). Names without a
            mapping below are used as the language prefix unchanged.

    Returns:
        The result of ``t.transliterate(text, "<lang>_IN")``.
    """
    script_to_lang = {
        "Devanagari": "hi",
        "Latin": "en",
        "Malayalam": "ml",
        "Gujarati": "gu",
        "Oriya": "or",
        "Telugu": "te",
        "Bengali": "bn",
        "Tamil": "ta",
        "Kannada": "kn",
        # Fix: Gurmukhi was mapped to "gu" (Gujarati); Punjabi is "pa".
        "Gurmukhi": "pa",
    }
    code = script_to_lang.get(dest_script, dest_script) + '_IN'
    return t.transliterate(text, code)
# -> Directly Usable indic_transliteration_IAST api for transliteration
def indic_transliteration_IAST(text):
    """Convert ``text`` from the IAST scheme to Devanagari using ``sanscript``."""
    return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
# -> Directly Usable indic_transliteration_ITRANS api for transliteration
def indic_transliteration_ITRANS(text):
    """Convert ``text`` from the ITRANS scheme to Devanagari using ``sanscript``."""
    return transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
# -> Directly Usable sheetal api for transliteration
def sheetal(text):
    """Run the ``dev-rom-sheetal.py`` script on ``text`` and return its stdout.

    The script is executed with the current interpreter; its output is decoded
    as UTF-8 and returned unmodified (including any trailing newline).
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-sheetal.py"
    raw_stdout = subprocess.check_output([sys.executable, script_path, text])
    return raw_stdout.decode('utf-8')
# -> Directly Usable ritwik code for transliteration
def ritwik(text):
    """Run the ``dev-rom-ritwik.py`` script on ``text`` and return its stdout.

    The script is executed with the current interpreter; its output is decoded
    as UTF-8 and returned unmodified (including any trailing newline).
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-ritwik.py"
    raw_stdout = subprocess.check_output([sys.executable, script_path, text])
    return raw_stdout.decode('utf-8')
# -> Directly Usable indic_transliteration_GURMUKHI api for transliteration
def indic_transliteration_GURMUKHI(text):
    """Convert ``text`` from the IAST scheme to Gurmukhi using ``sanscript``."""
    return transliterate(text, sanscript.IAST, sanscript.GURMUKHI)
# -> Directly Usable unicode_transliteration_GURMUKHI api for transliteration
def unicode_transliteration_GURMUKHI(text):
    """Convert IAST ``text`` to Punjabi script by way of Devanagari.

    The text is first mapped IAST -> Devanagari with ``sanscript`` and the
    result is passed to ``UnicodeIndicTransliterator`` with language codes
    ``"hi"`` -> ``"pa"``.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
# -> Directly Usable transliteration_LATIN_CYRILLIC api for transliteration
def transliteration_LATIN_CYRILLIC(text):
    """Return ``translit(text, 'bg')``, i.e. ``text`` rendered with the ``'bg'`` language pack."""
    return translit(text, 'bg')
# -> Directly Usable translit_CHINESE_LATIN api for transliteration
def translit_CHINESE_LATIN(text):
    """Return ``pinyin.get`` of ``text`` using the ``"strip"`` format and a space delimiter."""
    return pinyin.get(text, format="strip", delimiter=" ")
def translit_th_sin_mng_heb_to_latin(text):
    """Return the ``anyascii`` rendering of ``text``."""
    return anyascii(text)
# -> Directly Usable indic_transliteration_TELUGU api for transliteration
def indic_transliteration_TELUGU(text):
    """Convert ``text`` from the IAST scheme to Telugu using ``sanscript``."""
    return transliterate(text, sanscript.IAST, sanscript.TELUGU)
# -> Directly Usable indic_transliteration_GURMUKHI_LATIN api for transliteration
def indic_transliteration_GURMUKHI_LATIN(text):
    """Convert ``text`` from Gurmukhi to the ITRANS scheme using ``sanscript``."""
    return transliterate(text, sanscript.GURMUKHI, sanscript.ITRANS)
# -> Directly Usable unicode_transliteration_GURMUKHI_LATIN api for transliteration
def unicode_transliteration_GURMUKHI_LATIN(text):
    """Map ``text`` IAST -> Devanagari, then ``"hi"`` -> ``"pa"`` with ``UnicodeIndicTransliterator``.

    NOTE(review): the body performs the same conversion as
    ``unicode_transliteration_GURMUKHI`` (towards Punjabi script), which does
    not match the "GURMUKHI_LATIN" name - confirm the intended direction.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
# -> Directly Usable transliteration_CYRILIC_LATIN api for transliteration
def transliteration_CYRILIC_LATIN(text):
    """Return ``translit(text, 'bg', reversed=True)``, the reverse of the ``'bg'`` mapping."""
    return translit(text, 'bg', reversed=True)
# -> Replace Cyrillic letters with Latin equivalents
# Cyrillic -> Latin substitutions applied by readonly(). Built at module level
# because the function's parameter shadows the builtin ``str``.
_CYRILLIC_TO_LATIN = str.maketrans({
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e", "ё": "yo",
    "ж": "zh", "з": "z", "и": "i", "й": "j", "к": "k", "л": "l", "м": "m",
    "н": "n", "о": "o", "п": "p", "р": "r", "с": "s", "т": "t", "у": "u",
    "ф": "f", "х": "h", "ц": "c", "ч": "ch", "ш": "sh", "щ": "sch",
    "ъ": "j", "ы": "i", "ь": "j", "э": "e", "ю": "yu", "я": "ya",
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E", "Ё": "Yo",
    "Ж": "Zh", "З": "Z", "И": "I", "Й": "J", "К": "K", "Л": "L", "М": "M",
    "Н": "N", "О": "O", "П": "P", "Р": "R", "С": "S", "Т": "T", "У": "U",
    "Ф": "F", "Х": "H", "Ц": "C", "Ч": "Ch", "Ш": "Sh", "Щ": "Sch",
    "Ъ": "J", "Ы": "I", "Ь": "J", "Э": "E", "Ю": "Yu", "Я": "Ya",
})


def readonly(str):
    """Return ``str`` with Cyrillic letters replaced by Latin equivalents.

    Characters without an entry in the substitution table are left unchanged.
    The parameter keeps its original name (which shadows the builtin) so that
    existing keyword callers continue to work.
    """
    # One pass over the text instead of 66 successive whole-string replaces.
    return str.translate(_CYRILLIC_TO_LATIN)
# -> Code to Convert Letters to Latin Script
def ConvertToLatin(source):
    """Apply ``readonly`` to each item of ``source`` and concatenate the results."""
    return ''.join(readonly(letter) for letter in source)
# -> Directly Usable indic_transliteration_OTHER_DEVANAGRI api for transliteration
def indic_transliteration_OTHER_DEVANAGRI(text, src_script):
    """Convert ``text`` from ``src_script`` to Devanagari using ``sanscript``.

    Args:
        text: Text written in ``src_script``.
        src_script: One of "Malayalam", "Gujarati", "Telugu", "Oriya",
            "Bengali", "Kannada", "Gurmukhi" or "Tamil".

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    source_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Gujarati": sanscript.GUJARATI,
        "Telugu": sanscript.TELUGU,
        "Oriya": sanscript.ORIYA,
        "Bengali": sanscript.BENGALI,
        "Kannada": sanscript.KANNADA,
        "Gurmukhi": sanscript.GURMUKHI,
        "Tamil": sanscript.TAMIL,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Devanagari conversion: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.DEVANAGARI)
# -> Directly Usable indic_transliteration_DEVANAGRI_OTHER api for transliteration
def indic_transliteration_DEVANAGRI_OTHER(text, dest_script):
    """Convert Devanagari ``text`` to ``dest_script`` using ``sanscript``.

    Args:
        text: Text written in Devanagari.
        dest_script: One of "Malayalam", "Gujarati", "Telugu", "Oriya",
            "Bengali", "Kannada", "Gurmukhi" or "Tamil".

    Raises:
        ValueError: If ``dest_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Gujarati": sanscript.GUJARATI,
        "Telugu": sanscript.TELUGU,
        "Oriya": sanscript.ORIYA,
        "Bengali": sanscript.BENGALI,
        "Kannada": sanscript.KANNADA,
        "Gurmukhi": sanscript.GURMUKHI,
        "Tamil": sanscript.TAMIL,
    }
    if dest_script not in target_schemes:
        raise ValueError(f"Unsupported target script for Devanagari conversion: {dest_script!r}")
    return transliterate(text, sanscript.DEVANAGARI, target_schemes[dest_script])
# -> Directly Usable indic_transliteration_KANNADA_OTHER api for transliteration
def indic_transliteration_KANNADA_OTHER(text, dest_script):
    """Convert Kannada ``text`` to ``dest_script`` using ``sanscript``.

    Args:
        text: Text written in Kannada.
        dest_script: One of "Malayalam", "Telugu", "Tamil" or "Bengali".

    Raises:
        ValueError: If ``dest_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    target_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
        "Tamil": sanscript.TAMIL,
        "Bengali": sanscript.BENGALI,
    }
    if dest_script not in target_schemes:
        raise ValueError(f"Unsupported target script for Kannada conversion: {dest_script!r}")
    return transliterate(text, sanscript.KANNADA, target_schemes[dest_script])
# -> Directly Usable indic_transliteration_OTHER_KANNADA api for transliteration
def indic_transliteration_OTHER_KANNADA(text, src_script):
    """Convert ``text`` from ``src_script`` to Kannada using ``sanscript``.

    Args:
        text: Text written in ``src_script``.
        src_script: One of "Malayalam", "Telugu", "Tamil" or "Bengali".

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    source_schemes = {
        "Malayalam": sanscript.MALAYALAM,
        "Telugu": sanscript.TELUGU,
        "Tamil": sanscript.TAMIL,
        "Bengali": sanscript.BENGALI,
    }
    if src_script not in source_schemes:
        raise ValueError(f"Unsupported source script for Kannada conversion: {src_script!r}")
    return transliterate(text, source_schemes[src_script], sanscript.KANNADA)
# -> Directly Usable indic_transliteration_TAMIL_OTHER api for transliteration
def indic_transliteration_TAMIL_OTHER(text, dest_script):
    """Convert Tamil ``text`` to ``dest_script`` ("Malayalam" or "Telugu") using ``sanscript``.

    Raises:
        ValueError: If ``dest_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    if dest_script == "Malayalam":
        target_scheme = sanscript.MALAYALAM
    elif dest_script == "Telugu":
        target_scheme = sanscript.TELUGU
    else:
        raise ValueError(f"Unsupported target script for Tamil conversion: {dest_script!r}")
    return transliterate(text, sanscript.TAMIL, target_scheme)
# -> Directly Usable indic_transliteration_OTHER_TAMIL api for transliteration
def indic_transliteration_OTHER_TAMIL(text, src_script):
    """Convert ``text`` from ``src_script`` ("Malayalam" or "Telugu") to Tamil using ``sanscript``.

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    if src_script == "Malayalam":
        source_scheme = sanscript.MALAYALAM
    elif src_script == "Telugu":
        source_scheme = sanscript.TELUGU
    else:
        raise ValueError(f"Unsupported source script for Tamil conversion: {src_script!r}")
    return transliterate(text, source_scheme, sanscript.TAMIL)
# -> Directly Usable indic_transliteration_TELUGU_OTHER api for transliteration
def indic_transliteration_TELUGU_OTHER(text, desc_script):
    """Convert Telugu ``text`` to Malayalam using ``sanscript``.

    Args:
        text: Text written in Telugu.
        desc_script: Target script; only "Malayalam" is supported.

    Raises:
        ValueError: If ``desc_script`` is not "Malayalam" (the original fell
            through to an ``UnboundLocalError``).
    """
    if desc_script != "Malayalam":
        raise ValueError(f"Unsupported target script for Telugu conversion: {desc_script!r}")
    return transliterate(text, sanscript.TELUGU, sanscript.MALAYALAM)
# -> Directly Usable indic_transliteration_MALAYALAM_OTHER api for transliteration
def indic_transliteration_MALAYALAM_OTHER(text, desc_script):
    """Convert Malayalam ``text`` to Telugu using ``sanscript``.

    Args:
        text: Text written in Malayalam.
        desc_script: Target script; only "Telugu" is supported.

    Raises:
        ValueError: If ``desc_script`` is not "Telugu" (the original fell
            through to an ``UnboundLocalError``).
    """
    if desc_script != "Telugu":
        raise ValueError(f"Unsupported target script for Malayalam conversion: {desc_script!r}")
    return transliterate(text, sanscript.MALAYALAM, sanscript.TELUGU)
# -> Directly Usable indic_transliteration_OTHER_GUJARATI api for transliteration
def indic_transliteration_OTHER_GUJARATI(text, src_script):
    """Convert ``text`` from ``src_script`` ("Gurmukhi" or "Oriya") to Gujarati using ``sanscript``.

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    if src_script == "Gurmukhi":
        source_scheme = sanscript.GURMUKHI
    elif src_script == "Oriya":
        source_scheme = sanscript.ORIYA
    else:
        raise ValueError(f"Unsupported source script for Gujarati conversion: {src_script!r}")
    return transliterate(text, source_scheme, sanscript.GUJARATI)
# -> Directly Usable indic_transliteration_OTHER_GURMUKHI api for transliteration
def indic_transliteration_OTHER_GURMUKHI(text, src_script):
    """Convert ``text`` from ``src_script`` ("Gujarati" or "Oriya") to Gurmukhi using ``sanscript``.

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    if src_script == "Gujarati":
        source_scheme = sanscript.GUJARATI
    elif src_script == "Oriya":
        source_scheme = sanscript.ORIYA
    else:
        raise ValueError(f"Unsupported source script for Gurmukhi conversion: {src_script!r}")
    return transliterate(text, source_scheme, sanscript.GURMUKHI)
# -> Directly Usable indic_transliteration_OTHER_ORIYA api for transliteration
def indic_transliteration_OTHER_ORIYA(text, src_script):
    """Convert ``text`` from ``src_script`` ("Gujarati" or "Gurmukhi") to Oriya using ``sanscript``.

    Raises:
        ValueError: If ``src_script`` is not supported (the original fell
            through to an ``UnboundLocalError``).
    """
    if src_script == "Gujarati":
        source_scheme = sanscript.GUJARATI
    elif src_script == "Gurmukhi":
        source_scheme = sanscript.GURMUKHI
    else:
        raise ValueError(f"Unsupported source script for Oriya conversion: {src_script!r}")
    return transliterate(text, source_scheme, sanscript.ORIYA)
from indicnlp.tokenize import sentence_tokenize
def punct_remover(string):
    """Return ``string`` with every listed punctuation character replaced by a space.

    The punctuation set covers common ASCII marks plus the ellipsis character
    and the danda. Each mark becomes exactly one space, so the length of the
    text is preserved.
    """
    # The backslash is escaped explicitly: "\," in a plain literal is an
    # invalid escape sequence.
    punctuations = "!()-[]{};:'\"\\,<>./?@#$%^&*_~…।"
    # Single pass instead of one whole-string replace per punctuation hit.
    return string.translate(str.maketrans(dict.fromkeys(punctuations, " ")))
# Module-level sample values (language code, text, source and target script
# names). The commented-out driver code further down refers to these names.
# NOTE(review): no live code visible in this file reads them - confirm before removing.
source_lang = "hi"
text = "सड़क के बीच में एक बड़ा ट्रक क्यों है?"
source_script = "Devanagari"
dest_script = "Latin"
# from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
import Levenshtein
from rapidfuzz import fuzz
def calculate_edit_distance(original_word: str, transliterated_word: str) -> float:
    """Return ``Levenshtein.distance`` between the two words (lower means closer)."""
    edit_distance = Levenshtein.distance(original_word, transliterated_word)
    return edit_distance
def calculate_similarity(original_word: str, transliterated_word: str) -> float:
    """Return a length-normalised similarity: ``1 - distance / max(len)``.

    Two empty strings are treated as identical and score ``1.0``; the
    original raised ``ZeroDivisionError`` in that case.
    """
    longest = max(len(original_word), len(transliterated_word))
    if longest == 0:
        return 1.0
    return 1 - Levenshtein.distance(original_word, transliterated_word) / longest
def calculate_fuzz_similarity(original_word: str, transliterated_word: str) -> float:
    """Return ``fuzz.ratio`` of the two words (higher means more similar)."""
    ratio = fuzz.ratio(original_word, transliterated_word)
    return ratio
def get_best_output(inside_func: callable, original_word: str, transliteration_outputs: list, reverse: bool=False):
    """Pick the candidate that scores best against ``original_word``.

    Args:
        inside_func: Scoring function called as ``inside_func(original_word, candidate)``.
        original_word: Reference word; also the result when no candidate
            improves on the starting score.
        transliteration_outputs: Candidates to score.
        reverse: When false the highest score wins (starting from 0); when
            true the lowest score wins (starting from infinity).

    Returns:
        The first candidate with the strictly best score, or
        ``original_word`` if none beats the starting score.
    """
    minimise = reverse
    best_score = float('inf') if minimise else 0
    winner = original_word
    for candidate in transliteration_outputs:
        score = inside_func(original_word, candidate)
        print("total paramter", score, best_score, original_word, candidate)
        improved = score < best_score if minimise else score > best_score
        if improved:
            best_score, winner = score, candidate
    print(winner)
    return winner
def compare_transliteration_outputs(original_word: str, transliterated_words: list) -> str:
    """Return the candidate favoured by most of three similarity measures.

    Each of edit distance (lowest wins), normalised similarity and fuzz ratio
    (highest wins) nominates one candidate through ``get_best_output``; the
    most common nominee is returned. ``original_word`` is returned unchanged
    when either argument is ``None``.
    """
    if original_word is None or transliterated_words is None:
        return original_word

    # One vote per metric, collected in the same order as they are evaluated.
    votes = [
        get_best_output(calculate_edit_distance, original_word, transliterated_words, True),
        get_best_output(calculate_similarity, original_word, transliterated_words),
        get_best_output(calculate_fuzz_similarity, original_word, transliterated_words),
    ]
    winner = Counter(votes).most_common(1)[0][0]
    print(votes[0], votes[1], votes[2], "89999999999999999999")
    return winner
# sources_name = {"0": "indic_trans", "1": "Azure","2": "libindic", "3": "sheetal"}
# priority_list = ["indic_trans", "Azure", "libindic", "sheetal"]
# etc_punctuation = ["", " . . .", " . .", " . . ”"]
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
# source_lang = "hi"
# transliterated_text = []
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
# source_lang = "hi"
# transliterated_text = []
# Out = []
# print("sentences", sentences)
# for sentence in sentences[0].split():
# print("full word -> ", sentence)
# if sentence in etc_punctuation:
# continue
# temp_sentence = punct_remover(sentence)
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# t2 = libindic(temp_sentence, dest_script).rstrip()
# t3 = sheetal(temp_sentence).replace("\n", "")
# Out = []
# for i in range(len(temp_sentence.split())):
# word = temp_sentence.split()[i]
# T0 = t0.split()[i]
# T1 = t1.split()[i]
# T2 = t2.split()[i]
# T3 = t3.split()[i]
# outputs = [T0, T1, T2, T3]
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# out = compare_transliteration_outputs(temp_sentence, [t0, t1, t2, t3])
# # print("this words output is -> ", out)
# # out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(
# sentence, trans_sent_wo_punct
# )
# print("trans_sent_w_punct", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print(transliterated_sentence)
# print("Entered Exiting Here1212", Out)
# f = open("output.txt", "w")
# f.write(" ".join(Out))
# f.close()
# return " ".join(transliterated_text)
# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# t2 = libindic(temp_sentence, dest_script).rstrip()
# t3 = sheetal(temp_sentence).replace("\n", "")
# Out = []
# print(t0, t1, t2, t3)
# outputs_len = [len(T.split(" ")) for T in [t0, t1, t2, t3]]
# print(outputs_len, "outputs len")
# for i in range(len(temp_sentence.split())):
# word = temp_sentence.split()[i]
# T0 = t0.split()[i]
# T1 = t1.split()[i]
# T2 = t2.split()[i]
# T3 = t3.split()[i]
# outputs = [T0, T1, T2, T3]
#
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# out = compare_outputs_transliteration(temp_sentence, [t0,t1,t2,t3], sources_name, priority_list)
# # print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(temp_sentence, out)
# print("trans_sent_", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# print(" ".join(transliterated_text))
# if text in etc_punctuation:
# return text
# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# tt = 0
# try:
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# outputa = t0
# except:
# tt += 1
# try:
# if tt == 1:
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# outputa = t1
# except:
# tt += 1
# # print("before t1111111111")
# try:
# if tt == 2:
# t2 = libindic(temp_sentence, dest_script).rstrip()
# outputa = t2
# except:
# tt += 1
# # print("before sheetal", t2)
# try:
# if tt == 3:
# t3 = sheetal(temp_sentence).replace("\n", "")
# outputa = t3
# except:
# tt += 1
#
# if tt == 4:
# outputa = text
# else:
# trans_sent_wo_punct = outputa
# print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(
# sentence, trans_sent_wo_punct
# )
# print("trans_sent_w_punct", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# return outputa
# source_lang = "hi"
# source_script = "Latin"
# dest_script = "Devanagari"
# sources_name = {
# "0": "Azure",
# "1": "indic_trans",
# "2": "google",
# "3": "indic_trans_IAST",
# }
# sentences = sentence_tokenize.sentence_split(text, lang="en")
# priority_list = [
# "Azure",
# "indic_trans",
# "google",
# "indic_trans_IAST",
# ]
# transliterated_text = []
# for sentence in sentences:
# if (
# sentence == ""
# or sentence == " . . ."
# or sentence == " . ."
# or sentence == " . . ”"
# ):
# continue
# OUT = []
# for word in sentence.split():
# if word == ".":
# continue
# t0 = azure_transliteration(
# word, source_lang, source_script, dest_script)
# t1 = indic_trans(word, source_script, dest_script)
# t2 = google(word, "en", "hi")
# t3 = indic_transliteration_IAST(word)
# outputs = [t0, t1, t2, t3]
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# OUT.append(out)
# transliterated_text.append(" ".join(OUT))
# print("running perfectly")
# return " ".join(transliterated_text)
# print(indic_transliteration_IAST("mera naam dharmesh hai"))