Conversion_Kitchen_Code/kitchen_counter/conversion/translation/transliteration_testing.py

1058 lines
36 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# from transliteration_resources import (
# azure_transliteration,
# indic_trans,
# indic_transliteration_OTHER_GUJARATI,
# indic_transliteration_OTHER_GURMUKHI,
# indic_transliteration_OTHER_ORIYA,
# om_transliterator,
# libindic,
# indic_transliteration_IAST,
# indic_transliteration_ITRANS,
# # polyglot_trans,
# sheetal,
# unicode_transliteration_GURMUKHI,
# indic_transliteration_GURMUKHI,
# transliteration_LATIN_CYRILLIC,
# indic_transliteration_TELUGU,
# unicode_transliteration_GURMUKHI_LATIN,
# indic_transliteration_GURMUKHI_LATIN,
# transliteration_CYRILIC_LATIN,
# ConvertToLatin,
# readonly,
# indic_transliteration_OTHER_DEVANAGRI,
# indic_transliteration_DEVANAGRI_OTHER,
# indic_transliteration_KANNADA_OTHER,
# indic_transliteration_OTHER_KANNADA,
# indic_transliteration_TAMIL_OTHER,
# indic_transliteration_OTHER_TAMIL,
# indic_transliteration_TELUGU_OTHER,
# indic_transliteration_MALAYALAM_OTHER,
# indic_transliteration_OTHER_GUJARATI,
# indic_transliteration_OTHER_GURMUKHI,
# indic_transliteration_OTHER_ORIYA,
# translit_CHINESE_LATIN,
# translit_th_sin_mng_heb_to_latin
# ) # , translit_THAI_LATIN
import subprocess
import sys
import os
import requests, uuid, json
from indictrans import Transliterator
from om_transliterator import Transliterator as om_Transliterator
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from libindic.transliteration import getInstance
t = getInstance()
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from transliterate import translit # , get_available_language_codes
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
# import polyglot
# from polyglot.transliteration import Transliterator as poly
# from polyglot.text import Text
import pinyin
from anyascii import anyascii
# from MNF.settings import BasePath
basePath = "/home/user/mnf/project/MNF"
# -> Directly Usable azure api for transliteration
def azure_transliteration(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` between scripts with the Azure Translator REST API.

    Args:
        text: Text to transliterate.
        source_lang: Language code sent as the ``language`` query parameter
            (for example ``"hi"``).
        source_script: Script of ``text``. Names listed in ``script_codes``
            below (for example ``"Devanagari"``) are converted to the short
            code sent to the API; any other value is sent unchanged.
        dest_script: Target script, same convention as ``source_script``.

    Returns:
        The ``text`` field of the first item of the JSON response.

    Raises:
        KeyError, IndexError, TypeError: If the JSON response does not have
            the expected ``[{"text": ...}]`` shape. Error handling around the
            request is intentionally left to the caller.
    """
    # One shared table for both directions; unknown names pass through as-is.
    # NOTE(review): "knda" is lower-case unlike the other codes - confirm the
    # API accepts it.
    script_codes = {
        "Devanagari": "Deva",
        "Arabic": "Arab",
        "Latin": "Latn",
        "Kannada": "knda",
        "Tamil": "Taml",
        "Bengali": "Beng",
        "Telugu": "Telu",
        "Malayalam": "Mlym",
        "Cyrillic": "Cyrl",
        "Gurmukhi": "Guru",
        "Gujarati": "Gujr",
        "Oriya": "Orya",
        "Sinhala": "Sinh",
        "Hanji": "Hans",
        "Thai": "Thai",
        "Hebrew": "Hebr",
    }
    source_script = script_codes.get(source_script, source_script)
    dest_script = script_codes.get(dest_script, dest_script)

    # NOTE(security): hard-coded credential kept for backward compatibility;
    # it should be moved to configuration / a secret store.
    subscription_key = "959354878e73458e898a69f1f5887b69"
    location = "eastus"
    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }
    transliterate_url = "https://api.cognitive.microsofttranslator.com/transliterate?api-version=3.0"
    print("source_script", source_script)
    print("dest_script", dest_script)
    print("source_lang", source_lang)
    print("text", text)
    params = {'language': source_lang, 'fromScript': source_script, 'toScript': dest_script}
    body = [{'text': text}]
    request = requests.post(transliterate_url, params=params, headers=headers, json=body)
    response = request.json()
    print(response)
    return response[0]['text']
# Smoke call executed at import time: sends a romanized Hindi sample to
# azure_transliteration (Latn -> Deva) and prints the result.
print(azure_transliteration("mera naam dharmesh hai", "hi", "Latn", "Deva"))
from collections import Counter
def two_sources_two_outputs(sources_name, O):
    """Return the two most frequent outputs together with their sources.

    Args:
        sources_name: Mapping from the stringified position in ``O``
            (``"0"``, ``"1"``, ...) to the name of the source at that position.
        O: List of outputs, one per source.

    Returns:
        ``((out1, sources1), (out2, sources2))`` where ``out1`` / ``out2`` are
        the most and second most frequent values of ``O`` (ties keep
        first-seen order) and ``sources1`` / ``sources2`` list the source
        names that produced them. When ``O`` holds a single distinct value
        the second pair is ``("", "")``.
    """
    print("sources name is", sources_name, O)
    frequency = Counter(O)
    print("dict1", frequency)
    counts_desc = sorted(frequency.values(), reverse=True)
    print("sorted_value", counts_desc)

    # Rebuild the mapping ordered by descending count; first-seen order
    # breaks ties because existing keys keep their position on update.
    ranked = {}
    for count in counts_desc:
        ranked.update({key: n for key, n in frequency.items() if n == count})
    print("sorted_Dict", ranked)
    sources = list(ranked)
    print(sources)

    # For every distinct output, the positions in O where it occurs.
    positions = [
        [idx for idx, value in enumerate(O) if value == key]
        for key in Counter(O)
    ]
    print("rm", positions)
    distinct = list(Counter(O))
    print("fs", distinct)
    index_by_output = dict(zip(distinct, positions))
    print("here it is", index_by_output)

    out1 = sources[0]
    source1 = [sources_name[str(pos)] for pos in index_by_output[out1]]
    print(source1)
    if len(sources) == 1:
        return (out1, source1), ("", "")

    out2 = sources[1]
    source2 = [sources_name[str(pos)] for pos in index_by_output[out2]]
    print((out1, source1), (out2, source2))
    return (out1, source1), (out2, source2)
def _second_choice_by_priority(O, priority_list, source1, o2, s2):
    """Pick the second output from the first priority source not in ``source1``.

    Falls back to ``(o2, s2)`` when every entry of ``priority_list`` is
    already covered by ``source1`` (or the list is empty).
    """
    for name in priority_list:
        if name not in source1:
            # O is indexed by position in priority_list here, so both are
            # assumed to list the sources in the same order - TODO confirm.
            return O[priority_list.index(name)], name
    return o2, s2


def selection_source_transliteration(sources_name, O, priority_list):
    """Choose a first and a second transliteration from competing outputs.

    Outputs are ranked by how many sources agree on them. If one output has
    strictly more votes than every other it becomes the first choice;
    otherwise the output of the highest-priority source is preferred. The
    second choice is the runner-up by votes when that runner-up is itself
    unambiguous, else it is taken from the priority list.

    Args:
        sources_name: Mapping ``str(index) -> source name`` for ``O``.
        O: List of outputs, one per source.
        priority_list: Source names from most to least preferred.

    Returns:
        ``((output1, source1), (output2, source2))``. A source is a list of
        names when it was chosen by vote and a single name when it was
        chosen via ``priority_list``.
    """
    seq = list(Counter(O).values())
    print(seq)
    seq.sort(reverse=True)
    print(seq)

    # Positions whose count is strictly below the top count; the top output
    # is a clear winner only if that holds for all the others.
    check = [i for i in range(len(seq) - 1) if seq[0] > seq[i + 1]]
    print("check here is -> ", check)
    top_is_unique = len(check) == (len(seq) - 1)
    print("check", check)

    (o1, s1), (o2, s2) = two_sources_two_outputs(sources_name, O)

    if top_is_unique:
        print("here1")
        print((o1, s1), (o2, s2))
        output1, source1 = o1, s1
        print(seq)
        # Same test one rank down. The original appended to ``check`` instead
        # of ``check1``, so this could never succeed for 3+ distinct outputs.
        check1 = [i for i in range(len(seq) - 2) if seq[1] > seq[i + 2]]
        if len(check1) == (len(seq) - 2):
            output2, source2 = o2, s2
        else:
            output2, source2 = _second_choice_by_priority(
                O, priority_list, source1, o2, s2
            )
    else:
        # No clear winner by votes: trust the highest-priority source.
        if priority_list[0] in s1:
            output1, source1 = o1, s1
        elif priority_list[0] in s2:
            output1, source1 = o2, s2
        else:
            output1 = O[0]
            source1 = priority_list[0]
        output2, source2 = _second_choice_by_priority(
            O, priority_list, source1, o2, s2
        )
    return (output1, source1), (output2, source2)
def space_after_punct(text):
    """Normalize spacing around punctuation before word alignment.

    Rewrites a spaced ellipsis ``". . ."`` as ``" ... "``, inserts a space
    after each of ``, ! ? ( ) … -`` and finally collapses every run of two or
    more whitespace characters into a single space.

    Args:
        text: Input string.

    Returns:
        The normalized string (a trailing space may remain when ``text`` ends
        with one of the handled punctuation marks).
    """
    import re

    text = text.replace(". . .", " ... ")
    text = re.sub(r"([,!?()…-])", r"\1 ", text)
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    return re.sub(r"\s{2,}", " ", text)
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to ``transliterated``.

    ``original`` is first normalized with ``space_after_punct``. Its
    whitespace-separated tokens are then walked in order: a token that is
    itself punctuation is copied through, every other token is replaced by
    the next word of ``transliterated``, followed by the token's last
    character when that character is punctuation.

    Args:
        original: Source sentence including punctuation.
        transliterated: Transliteration of the sentence without punctuation,
            expected to hold one word per non-punctuation token.

    Returns:
        The transliterated sentence with punctuation restored and
        ``" ... "`` collapsed to ``"..."``.

    Raises:
        IndexError: If ``transliterated`` has fewer words than ``original``
            has non-punctuation tokens.
    """
    import string

    original = space_after_punct(original)
    # ASCII punctuation plus the ellipsis token produced by space_after_punct.
    # NOTE(review): the original list also held two empty-string entries that
    # can never match a token; they may be characters lost in a copy - confirm.
    punct = set(string.punctuation) | {" ", "..."}

    # Split once instead of on every loop iteration.
    source_tokens = original.split()
    target_words = transliterated.split()

    sentence = []
    next_word = 0
    for token in source_tokens:
        if token in punct:
            sentence.append(token)
            continue
        word = target_words[next_word]
        next_word += 1
        if token[-1] in punct:
            word += token[-1]
        sentence.append(word)

    transliterated_sentence = " ".join(sentence)
    # str.replace returns a new string; the original discarded the result.
    return transliterated_sentence.replace(" ... ", "...")
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Return the preferred transliteration among ``outputs``.

    Delegates the ranking to ``selection_source_transliteration`` and keeps
    only the text of the first choice.

    Args:
        word: The original word. Not used by the active code; it was only
            passed to a docx comparison-table helper that is now disabled.
        outputs: Candidate transliterations, one per source.
        sources_name: Mapping ``str(index) -> source name`` for ``outputs``.
        priority_list: Source names from most to least preferred.

    Returns:
        The first-choice output.
    """
    # NOTE: this function used to build a landscape docx table comparing the
    # outputs (add_dial_comparison_doc2_transliteration); that is disabled.
    first_choice, _second_choice = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    print(first_choice, "compare all transliterations")
    return first_choice[0]
# -> Directly Usable Polyglot api for transliteration
# def polyglot_trans(text, source_script, dest_script):
# # from polyglot.downloader import downloader
# if source_script=="Latin":
# source_script="en"
# elif source_script=="Arabic":
# source_script="ar"
# elif source_script=="Hanji":
# source_script="zh"
# if dest_script=="Latin":
# dest_script="en"
# elif dest_script=="Arabic":
# dest_script="ar"
# elif source_script=="Hanji":
# source_script="zh"
# new_text = ""
# text_break = Text(text)
# for x in text_break.transliterate(dest_script):
# new_text = new_text + str(x)
# return new_text
# -> Directly Usable indic_trans api for transliteration
def indic_trans(text, source_script, dest_script):
    """Transliterate ``text`` with the ``indictrans`` ``Transliterator``.

    Args:
        text: Text to transliterate.
        source_script: Script name of ``text`` (for example ``"Devanagari"``).
            Names in ``language_codes`` are converted to the three-letter code
            passed to ``Transliterator``; other values are passed unchanged.
        dest_script: Target script name, same convention.

    Returns:
        The result of ``Transliterator.transform(text)``.
    """
    # Single table for both directions. The original destination chain
    # tested ``source_script`` in one branch, repeated Telugu/Gujarati/Oriya
    # in unreachable branches and mapped Bengali to "Ben" although the source
    # side uses the lower-case "ben".
    language_codes = {
        "Devanagari": "hin",
        "Arabic": "urd",
        "Kannada": "kan",
        "Tamil": "tam",
        "Latin": "eng",
        "Bengali": "ben",
        "Telugu": "tel",
        "Malayalam": "mal",
        "Oriya": "ori",
        "Gujarati": "guj",
        "Gurmukhi": "pan",
    }
    source_code = language_codes.get(source_script, source_script)
    target_code = language_codes.get(dest_script, dest_script)
    trn = Transliterator(source=source_code, target=target_code, build_lookup=True)
    return trn.transform(text)
# -> Directly Usable om_translator api for transliteration
def om_transliterator(text):
    """Return ``text`` converted with ``om_Transliterator.knda_to_latn``."""
    return om_Transliterator().knda_to_latn(text)
# -> Directly Usable libindic api for transliteration
def libindic(text, dest_script):
    """Transliterate ``text`` with the module-level libindic instance ``t``.

    Args:
        text: Text to transliterate.
        dest_script: Target script name (for example ``"Devanagari"``). Names
            in ``language_codes`` are converted to a two-letter code; other
            values are used unchanged. ``"_IN"`` is appended to form the code
            passed to ``t.transliterate``.

    Returns:
        The result of ``t.transliterate(text, code)``.
    """
    language_codes = {
        "Devanagari": "hi",
        "Latin": "en",
        "Malayalam": "ml",
        "Gujarati": "gu",
        "Oriya": "or",
        "Telugu": "te",
        "Bengali": "bn",
        "Tamil": "ta",
        "Kannada": "kn",
        # Was "gu", which is the Gujarati code; Gurmukhi (Punjabi) is "pa".
        "Gurmukhi": "pa",
    }
    code = language_codes.get(dest_script, dest_script) + '_IN'
    return t.transliterate(text, code)
# -> Directly Usable indic_transliteration_IAST api for transliteration
def indic_transliteration_IAST(text):
    """Convert ``text`` from the IAST scheme to Devanagari with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
# -> Directly Usable indic_transliteration_ITRANS api for transliteration
def indic_transliteration_ITRANS(text):
    """Convert ``text`` from the ITRANS scheme to Devanagari with sanscript."""
    return transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
# -> Directly Usable sheetal api for transliteration
def sheetal(text):
    """Run ``dev-rom-sheetal.py`` on ``text`` and return its stdout.

    The script is started with the current interpreter and receives ``text``
    as its only argument; the captured output is decoded as UTF-8 and
    returned unmodified.
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-sheetal.py"
    raw_output = subprocess.check_output([sys.executable, script_path, text])
    return raw_output.decode('utf-8')
# -> Directly Usable ritwik code for transliteration
def ritwik(text):
    """Run ``dev-rom-ritwik.py`` on ``text`` and return its stdout.

    The script is started with the current interpreter and receives ``text``
    as its only argument; the captured output is decoded as UTF-8 and
    returned unmodified.
    """
    script_path = rf"{basePath}/conversion/translation/dev-rom-ritwik.py"
    raw_output = subprocess.check_output([sys.executable, script_path, text])
    return raw_output.decode('utf-8')
# -> Directly Usable indic_transliteration_GURMUKHI api for transliteration
def indic_transliteration_GURMUKHI(text):
    """Convert ``text`` from the IAST scheme to Gurmukhi with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.GURMUKHI)
# -> Directly Usable unicode_transliteration_GURMUKHI api for transliteration
def unicode_transliteration_GURMUKHI(text):
    """Convert IAST ``text`` to the ``"pa"`` script via Devanagari.

    The text is first converted from IAST to Devanagari with sanscript and
    then passed through ``UnicodeIndicTransliterator`` from ``"hi"`` to
    ``"pa"``.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
# -> Directly Usable transliteration_LATIN_CYRILLIC api for transliteration
def transliteration_LATIN_CYRILLIC(text):
    """Return ``translit(text, 'bg')`` (forward direction of the 'bg' pack)."""
    return translit(text, 'bg')
# -> Directly Usable translit_CHINESE_LATIN api for transliteration
def translit_CHINESE_LATIN(text):
    """Return the pinyin of ``text`` in "strip" format, space-delimited."""
    return pinyin.get(text, format="strip", delimiter=" ")
def translit_th_sin_mng_heb_to_latin(text):
    """Return ``text`` converted by ``anyascii``."""
    return anyascii(text)
# -> Directly Usable indic_transliteration_TELUGU api for transliteration
def indic_transliteration_TELUGU(text):
    """Convert ``text`` from the IAST scheme to Telugu with sanscript."""
    return transliterate(text, sanscript.IAST, sanscript.TELUGU)
# -> Directly Usable indic_transliteration_GURMUKHI_LATIN api for transliteration
def indic_transliteration_GURMUKHI_LATIN(text):
    """Convert ``text`` from Gurmukhi to the ITRANS scheme with sanscript."""
    return transliterate(text, sanscript.GURMUKHI, sanscript.ITRANS)
# -> Directly Usable unicode_transliteration_GURMUKHI_LATIN api for transliteration
def unicode_transliteration_GURMUKHI_LATIN(text):
    """Convert IAST ``text`` to the ``"pa"`` script via Devanagari.

    The text is first converted from IAST to Devanagari with sanscript and
    then passed through ``UnicodeIndicTransliterator`` from ``"hi"`` to
    ``"pa"``.

    NOTE(review): despite the ``_LATIN`` suffix this body is identical to
    ``unicode_transliteration_GURMUKHI`` - confirm whether a Gurmukhi ->
    Latin conversion was intended.
    """
    devanagari_text = transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    return UnicodeIndicTransliterator.transliterate(devanagari_text, "hi", "pa")
# -> Directly Usable transliteration_CYRILIC_LATIN api for transliteration
def transliteration_CYRILIC_LATIN(text):
    """Return ``translit(text, 'bg', reversed=True)`` (reverse of the 'bg' pack)."""
    return translit(text, 'bg', reversed=True)
# -> Replace Cyrillic letters with their Latin equivalents
# Cyrillic -> Latin replacements applied by readonly(); characters that are
# not listed are left untouched.
_CYRILLIC_TO_LATIN = str.maketrans({
    "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e",
    "ё": "yo", "ж": "zh", "з": "z", "и": "i", "й": "j", "к": "k",
    "л": "l", "м": "m", "н": "n", "о": "o", "п": "p", "р": "r",
    "с": "s", "т": "t", "у": "u", "ф": "f", "х": "h", "ц": "c",
    "ч": "ch", "ш": "sh", "щ": "sch", "ъ": "j", "ы": "i", "ь": "j",
    "э": "e", "ю": "yu", "я": "ya",
    "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E",
    "Ё": "Yo", "Ж": "Zh", "З": "Z", "И": "I", "Й": "J", "К": "K",
    "Л": "L", "М": "M", "Н": "N", "О": "O", "П": "P", "Р": "R",
    "С": "S", "Т": "T", "У": "U", "Ф": "F", "Х": "H", "Ц": "C",
    "Ч": "Ch", "Ш": "Sh", "Щ": "Sch", "Ъ": "J", "Ы": "I", "Ь": "J",
    "Э": "E", "Ю": "Yu", "Я": "Ya",
})


def readonly(str):
    """Replace every Cyrillic letter in ``str`` with its Latin equivalent.

    Args:
        str: Text to convert. The parameter name shadows the builtin and is
            kept only so existing keyword callers continue to work.

    Returns:
        A copy of the text with the letters listed in ``_CYRILLIC_TO_LATIN``
        replaced; all other characters are unchanged.
    """
    # One pass with a translation table instead of 66 chained replace() calls.
    # No replacement contains a Cyrillic letter, so the result is the same.
    return str.translate(_CYRILLIC_TO_LATIN)
# -> Code to Convert Letters to Latin Script
def ConvertToLatin(source):
    """Return ``source`` with each letter passed through ``readonly``.

    Every element of ``source`` is converted on its own and the converted
    pieces are concatenated in order.
    """
    return "".join(readonly(letter) for letter in source)
# -> Directly Usable indic_transliteration_OTHER_DEVANAGRI api for transliteration
def indic_transliteration_OTHER_DEVANAGRI(text, src_script):
    """Convert ``text`` from ``src_script`` to Devanagari with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: One of "Malayalam", "Gujarati", "Telugu", "Oriya",
            "Bengali", "Kannada", "Gurmukhi" or "Tamil".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Malayalam":
        return transliterate(text, sanscript.MALAYALAM, sanscript.DEVANAGARI)
    if src_script == "Gujarati":
        return transliterate(text, sanscript.GUJARATI, sanscript.DEVANAGARI)
    if src_script == "Telugu":
        return transliterate(text, sanscript.TELUGU, sanscript.DEVANAGARI)
    if src_script == "Oriya":
        return transliterate(text, sanscript.ORIYA, sanscript.DEVANAGARI)
    if src_script == "Bengali":
        return transliterate(text, sanscript.BENGALI, sanscript.DEVANAGARI)
    if src_script == "Kannada":
        return transliterate(text, sanscript.KANNADA, sanscript.DEVANAGARI)
    if src_script == "Gurmukhi":
        return transliterate(text, sanscript.GURMUKHI, sanscript.DEVANAGARI)
    if src_script == "Tamil":
        return transliterate(text, sanscript.TAMIL, sanscript.DEVANAGARI)
    raise ValueError(f"Unsupported source script for Devanagari output: {src_script!r}")
# -> Directly Usable indic_transliteration_DEVANAGRI_OTHER api for transliteration
def indic_transliteration_DEVANAGRI_OTHER(text, dest_script):
    """Convert Devanagari ``text`` to ``dest_script`` with sanscript.

    Args:
        text: Text written in Devanagari.
        dest_script: One of "Malayalam", "Gujarati", "Telugu", "Oriya",
            "Bengali", "Kannada", "Gurmukhi" or "Tamil".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``dest_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if dest_script == "Malayalam":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.MALAYALAM)
    if dest_script == "Gujarati":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.GUJARATI)
    if dest_script == "Telugu":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.TELUGU)
    if dest_script == "Oriya":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.ORIYA)
    if dest_script == "Bengali":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.BENGALI)
    if dest_script == "Kannada":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.KANNADA)
    if dest_script == "Gurmukhi":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.GURMUKHI)
    if dest_script == "Tamil":
        return transliterate(text, sanscript.DEVANAGARI, sanscript.TAMIL)
    raise ValueError(f"Unsupported destination script for Devanagari input: {dest_script!r}")
# -> Directly Usable indic_transliteration_KANNADA_OTHER api for transliteration
def indic_transliteration_KANNADA_OTHER(text, dest_script):
    """Convert Kannada ``text`` to ``dest_script`` with sanscript.

    Args:
        text: Text written in Kannada.
        dest_script: One of "Malayalam", "Telugu", "Tamil" or "Bengali".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``dest_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if dest_script == "Malayalam":
        return transliterate(text, sanscript.KANNADA, sanscript.MALAYALAM)
    if dest_script == "Telugu":
        return transliterate(text, sanscript.KANNADA, sanscript.TELUGU)
    if dest_script == "Tamil":
        return transliterate(text, sanscript.KANNADA, sanscript.TAMIL)
    if dest_script == "Bengali":
        return transliterate(text, sanscript.KANNADA, sanscript.BENGALI)
    raise ValueError(f"Unsupported destination script for Kannada input: {dest_script!r}")
# -> Directly Usable indic_transliteration_OTHER_KANNADA api for transliteration
def indic_transliteration_OTHER_KANNADA(text, src_script):
    """Convert ``text`` from ``src_script`` to Kannada with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: One of "Malayalam", "Telugu", "Tamil" or "Bengali".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Malayalam":
        return transliterate(text, sanscript.MALAYALAM, sanscript.KANNADA)
    if src_script == "Telugu":
        return transliterate(text, sanscript.TELUGU, sanscript.KANNADA)
    if src_script == "Tamil":
        return transliterate(text, sanscript.TAMIL, sanscript.KANNADA)
    if src_script == "Bengali":
        return transliterate(text, sanscript.BENGALI, sanscript.KANNADA)
    raise ValueError(f"Unsupported source script for Kannada output: {src_script!r}")
# -> Directly Usable indic_transliteration_TAMIL_OTHER api for transliteration
def indic_transliteration_TAMIL_OTHER(text, dest_script):
    """Convert Tamil ``text`` to ``dest_script`` with sanscript.

    Args:
        text: Text written in Tamil.
        dest_script: "Malayalam" or "Telugu".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``dest_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if dest_script == "Malayalam":
        return transliterate(text, sanscript.TAMIL, sanscript.MALAYALAM)
    if dest_script == "Telugu":
        return transliterate(text, sanscript.TAMIL, sanscript.TELUGU)
    raise ValueError(f"Unsupported destination script for Tamil input: {dest_script!r}")
# -> Directly Usable indic_transliteration_OTHER_TAMIL api for transliteration
def indic_transliteration_OTHER_TAMIL(text, src_script):
    """Convert ``text`` from ``src_script`` to Tamil with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: "Malayalam" or "Telugu".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Malayalam":
        return transliterate(text, sanscript.MALAYALAM, sanscript.TAMIL)
    if src_script == "Telugu":
        return transliterate(text, sanscript.TELUGU, sanscript.TAMIL)
    raise ValueError(f"Unsupported source script for Tamil output: {src_script!r}")
# -> Directly Usable indic_transliteration_TELUGU_OTHER api for transliteration
def indic_transliteration_TELUGU_OTHER(text, desc_script):
    """Convert Telugu ``text`` to ``desc_script`` with sanscript.

    Args:
        text: Text written in Telugu.
        desc_script: Destination script; only "Malayalam" is supported.

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``desc_script`` is not supported (previously this
            surfaced as an UnboundLocalError).
    """
    if desc_script == "Malayalam":
        return transliterate(text, sanscript.TELUGU, sanscript.MALAYALAM)
    raise ValueError(f"Unsupported destination script for Telugu input: {desc_script!r}")
# -> Directly Usable indic_transliteration_MALAYALAM_OTHER api for transliteration
def indic_transliteration_MALAYALAM_OTHER(text, desc_script):
    """Convert Malayalam ``text`` to ``desc_script`` with sanscript.

    Args:
        text: Text written in Malayalam.
        desc_script: Destination script; only "Telugu" is supported.

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``desc_script`` is not supported (previously this
            surfaced as an UnboundLocalError).
    """
    if desc_script == "Telugu":
        return transliterate(text, sanscript.MALAYALAM, sanscript.TELUGU)
    raise ValueError(f"Unsupported destination script for Malayalam input: {desc_script!r}")
# -> Directly Usable indic_transliteration_OTHER_GUJARATI api for transliteration
def indic_transliteration_OTHER_GUJARATI(text, src_script):
    """Convert ``text`` from ``src_script`` to Gujarati with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: "Gurmukhi" or "Oriya".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Gurmukhi":
        return transliterate(text, sanscript.GURMUKHI, sanscript.GUJARATI)
    if src_script == "Oriya":
        return transliterate(text, sanscript.ORIYA, sanscript.GUJARATI)
    raise ValueError(f"Unsupported source script for Gujarati output: {src_script!r}")
# -> Directly Usable indic_transliteration_OTHER_GURMUKHI api for transliteration
def indic_transliteration_OTHER_GURMUKHI(text, src_script):
    """Convert ``text`` from ``src_script`` to Gurmukhi with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: "Gujarati" or "Oriya".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Gujarati":
        return transliterate(text, sanscript.GUJARATI, sanscript.GURMUKHI)
    if src_script == "Oriya":
        return transliterate(text, sanscript.ORIYA, sanscript.GURMUKHI)
    raise ValueError(f"Unsupported source script for Gurmukhi output: {src_script!r}")
# -> Directly Usable indic_transliteration_OTHER_ORIYA api for transliteration
def indic_transliteration_OTHER_ORIYA(text, src_script):
    """Convert ``text`` from ``src_script`` to Oriya with sanscript.

    Args:
        text: Text written in ``src_script``.
        src_script: "Gujarati" or "Gurmukhi".

    Returns:
        The transliterated text.

    Raises:
        ValueError: If ``src_script`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError).
    """
    if src_script == "Gujarati":
        return transliterate(text, sanscript.GUJARATI, sanscript.ORIYA)
    if src_script == "Gurmukhi":
        return transliterate(text, sanscript.GURMUKHI, sanscript.ORIYA)
    raise ValueError(f"Unsupported source script for Oriya output: {src_script!r}")
from indicnlp.tokenize import sentence_tokenize
def punct_remover(string):
    """Replace every punctuation character in ``string`` with a space.

    The characters handled are ``!()-[]{};:'"\\,<>./?@#$%^&*_~`` plus the
    ellipsis character and the danda.

    Args:
        string: Input text.

    Returns:
        A copy of ``string`` of the same length in which each punctuation
        character has been replaced by a single space.
    """
    # The backslash is escaped explicitly; the original literal relied on the
    # invalid escape sequence "\," to include it.
    punctuations = "!()-[]{};:'\"\\,<>./?@#$%^&*_~…।"
    # One translation pass instead of one str.replace per matching character.
    return string.translate(str.maketrans(dict.fromkeys(punctuations, " ")))
# Module-level sample inputs: a Hindi sentence in Devanagari with Latin as
# the target script.
# NOTE(review): only the commented-out driver code further down appears to
# read these - confirm before removing.
source_lang = "hi"
text = "सड़क के बीच में एक बड़ा ट्रक क्यों है?"
source_script = "Devanagari"
dest_script = "Latin"
# from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
import Levenshtein
from rapidfuzz import fuzz
def calculate_edit_distance(original_word: str, transliterated_word: str) -> float:
    """Return ``Levenshtein.distance`` between the two words (lower is closer)."""
    distance = Levenshtein.distance(original_word, transliterated_word)
    return distance
def calculate_similarity(original_word: str, transliterated_word: str) -> float:
    """Return a length-normalized Levenshtein similarity.

    Computed as ``1 - distance / max(len(a), len(b))``, so identical strings
    score ``1.0``.

    Args:
        original_word: Reference string.
        transliterated_word: Candidate string.

    Returns:
        The similarity score; ``1.0`` when both strings are empty.
    """
    longest = max(len(original_word), len(transliterated_word))
    if longest == 0:
        # Two empty strings are identical; avoids a ZeroDivisionError.
        return 1.0
    return 1 - Levenshtein.distance(original_word, transliterated_word) / longest
def calculate_fuzz_similarity(original_word: str, transliterated_word: str) -> float:
    """Return ``fuzz.ratio`` of the two words (higher is closer)."""
    ratio = fuzz.ratio(original_word, transliterated_word)
    return ratio
def get_best_output(inside_func: callable, original_word: str, transliteration_outputs: list, reverse: bool=False):
    """Return the candidate that scores best against ``original_word``.

    Args:
        inside_func: Scoring function called as
            ``inside_func(original_word, candidate)``.
        original_word: Reference word; also the result when no candidate
            beats the starting threshold.
        transliteration_outputs: Candidates to score.
        reverse: When false (default) the highest score wins and a candidate
            must score above ``0``; when true the lowest score wins, starting
            from infinity.

    Returns:
        The winning candidate. Ties keep the earliest candidate because only
        a strictly better score replaces the current best.
    """
    best = original_word
    threshold = float('inf') if reverse else 0
    for candidate in transliteration_outputs:
        score = inside_func(original_word, candidate)
        print("total paramter", score, threshold, original_word, candidate)
        improved = score < threshold if reverse else score > threshold
        if improved:
            threshold = score
            best = candidate
    print(best)
    return best
def compare_transliteration_outputs(original_word: str, transliterated_words: list) -> str:
    """Pick the best transliteration by majority vote of three metrics.

    The candidates are ranked three times with ``get_best_output`` - by edit
    distance (lowest wins), by normalized similarity and by fuzz ratio
    (highest wins) - and the candidate chosen most often is returned.

    Args:
        original_word: Reference word.
        transliterated_words: Candidate transliterations.

    Returns:
        The winning candidate, or ``original_word`` unchanged when either
        argument is ``None``.
    """
    if original_word is None or transliterated_words is None:
        return original_word

    by_distance = get_best_output(calculate_edit_distance, original_word, transliterated_words, True)
    by_similarity = get_best_output(calculate_similarity, original_word, transliterated_words)
    by_fuzz = get_best_output(calculate_fuzz_similarity, original_word, transliterated_words)

    votes = Counter([by_distance, by_similarity, by_fuzz])
    winner = votes.most_common(1)[0][0]
    print(by_distance, by_similarity, by_fuzz, "89999999999999999999")
    return winner
# sources_name = {"0": "indic_trans", "1": "Azure","2": "libindic", "3": "sheetal"}
# priority_list = ["indic_trans", "Azure", "libindic", "sheetal"]
# etc_punctuation = ["", " . . .", " . .", " . . ”"]
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
# source_lang = "hi"
# transliterated_text = []
# sentences = sentence_tokenize.sentence_split(text, lang="hi")
# if source_lang == "ne":
# source_lang = "hi"
# transliterated_text = []
# Out = []
# print("sentences", sentences)
# for sentence in sentences[0].split():
# print("full word -> ", sentence)
# if sentence in etc_punctuation:
# continue
# temp_sentence = punct_remover(sentence)
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# t2 = libindic(temp_sentence, dest_script).rstrip()
# t3 = sheetal(temp_sentence).replace("\n", "")
# Out = []
# for i in range(len(temp_sentence.split())):
# word = temp_sentence.split()[i]
# T0 = t0.split()[i]
# T1 = t1.split()[i]
# T2 = t2.split()[i]
# T3 = t3.split()[i]
# outputs = [T0, T1, T2, T3]
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# out = compare_transliteration_outputs(temp_sentence, [t0, t1, t2, t3])
# # print("this words output is -> ", out)
# # out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(
# sentence, trans_sent_wo_punct
# )
# print("trans_sent_w_punct", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print(transliterated_sentence)
# print("Entered Exiting Here1212", Out)
# f = open("output.txt", "w")
# f.write(" ".join(Out))
# f.close()
# return " ".join(transliterated_text)
# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# t2 = libindic(temp_sentence, dest_script).rstrip()
# t3 = sheetal(temp_sentence).replace("\n", "")
# Out = []
# print(t0, t1, t2, t3)
# outputs_len = [len(T.split(" ")) for T in [t0, t1, t2, t3]]
# print(outputs_len, "outputs len")
# for i in range(len(temp_sentence.split())):
# word = temp_sentence.split()[i]
# T0 = t0.split()[i]
# T1 = t1.split()[i]
# T2 = t2.split()[i]
# T3 = t3.split()[i]
# outputs = [T0, T1, T2, T3]
#
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# Out.append(out)
# trans_sent_wo_punct = " ".join(Out)
# out = compare_outputs_transliteration(temp_sentence, [t0,t1,t2,t3], sources_name, priority_list)
# # print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(temp_sentence, out)
# print("trans_sent_", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# print(" ".join(transliterated_text))
# if text in etc_punctuation:
# return text
# # print("original_sentence", sentence)
# temp_sentence = punct_remover(text)
# tt = 0
# try:
# t0 = indic_trans(temp_sentence, source_script, dest_script)
# outputa = t0
# except:
# tt += 1
# try:
# if tt == 1:
# t1 = azure_transliteration(
# temp_sentence, source_lang, source_script, dest_script
# )
# outputa = t1
# except:
# tt += 1
# # print("before t1111111111")
# try:
# if tt == 2:
# t2 = libindic(temp_sentence, dest_script).rstrip()
# outputa = t2
# except:
# tt += 1
# # print("before sheetal", t2)
# try:
# if tt == 3:
# t3 = sheetal(temp_sentence).replace("\n", "")
# outputa = t3
# except:
# tt += 1
#
# if tt == 4:
# outputa = text
# else:
# trans_sent_wo_punct = outputa
# print("trans_sent_wo_punct", trans_sent_wo_punct)
# transliterated_sentence = final_transliterated_sentence(
# sentence, trans_sent_wo_punct
# )
# print("trans_sent_w_punct", transliterated_sentence)
# transliterated_text.append(transliterated_sentence)
# print("Entered Exiting Here1212")
# return outputa
# source_lang = "hi"
# source_script = "Latin"
# dest_script = "Devanagari"
# sources_name = {
# "0": "Azure",
# "1": "indic_trans",
# "2": "google",
# "3": "indic_trans_IAST",
# }
# sentences = sentence_tokenize.sentence_split(text, lang="en")
# priority_list = [
# "Azure",
# "indic_trans",
# "google",
# "indic_trans_IAST",
# ]
# transliterated_text = []
# for sentence in sentences:
# if (
# sentence == ""
# or sentence == " . . ."
# or sentence == " . ."
# or sentence == " . . ”"
# ):
# continue
# OUT = []
# for word in sentence.split():
# if word == ".":
# continue
# t0 = azure_transliteration(
# word, source_lang, source_script, dest_script)
# t1 = indic_trans(word, source_script, dest_script)
# t2 = google(word, "en", "hi")
# t3 = indic_transliteration_IAST(word)
# outputs = [t0, t1, t2, t3]
# out = compare_outputs_transliteration(
# word, outputs, sources_name, priority_list
# )
# OUT.append(out)
# transliterated_text.append(" ".join(OUT))
# print("running perfectly")
# return " ".join(transliterated_text)
# print(indic_transliteration_IAST("mera naam dharmesh hai"))