# Source: Conversion_Kitchen_Code/kitchen_counter/conversion/translation/transliteration_function.py
# (repository viewer metadata: 498 lines, 22 KiB, Python, executable file)

from indicnlp.tokenize import sentence_tokenize
from .transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal,unicode_transliteration_GURMUKHI,indic_transliteration_GURMUKHI,transliteration_LATIN_CYRILLIC,indic_transliteration_TELUGU
def google_length_checker(t, temp_sentence, t0):
    """Pad a short transliteration with trailing words from a fallback.

    The whitespace-token count of ``t`` is compared with that of
    ``temp_sentence``.  When ``t`` is exactly one or two tokens short, the
    last one or two tokens of ``t0`` are appended, separated by single
    spaces.  In every other case (same length, longer, or more than two
    tokens short) ``t`` is returned unchanged.

    Args:
        t: Candidate transliteration to check.
        temp_sentence: Sentence whose token count is the target length.
        t0: Fallback transliteration that supplies the trailing tokens.

    Returns:
        ``t``, possibly extended with up to two trailing tokens of ``t0``.
    """
    missing = len(temp_sentence.split()) - len(t.split())
    if missing not in (1, 2):
        return t
    # Slicing (rather than indexing [-1] / [-2]) tolerates a fallback that has
    # fewer tokens than are missing: it simply contributes what it has.
    tail = t0.split()[-missing:]
    return " ".join([t] + tail)
def Halant_remover(T3):
    """Strip one trailing halant (virama, U+094D) from ``T3`` if present.

    Args:
        T3: A single transliterated token.

    Returns:
        ``T3`` without its final character when that character is the
        Devanagari halant; otherwise ``T3`` unchanged.  An empty string is
        returned as is.
    """
    # Written as an escape so the combining mark cannot be silently dropped
    # by editors or copy/paste (an empty literal here would never match).
    halant = "\u094d"
    # endswith() is safe on an empty token, unlike indexing T3[-1].
    if T3.endswith(halant):
        return T3[:-1]
    return T3
def dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using four candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``azure_transliteration``, ``indic_trans``, ``google`` and
    ``indic_transliteration_IAST``.  The four results are aligned token by
    token with the source words, ``compare_outputs_transliteration`` is asked
    to pick the output for every word, and ``final_transliterated_sentence``
    receives the original sentence plus the joined picks (presumably to put
    the punctuation back -- confirm against its definition).

    NOTE(review): ``google`` is not among the imports visible at the top of
    this file; confirm it is defined elsewhere in the module.

    Args:
        text: Input text; split with ``sentence_split(..., lang='en')``.
        source_lang: Ignored -- the value is overridden with ``"hi"``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    # The caller's language code is overridden: this path always passes "hi".
    source_lang = "hi"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)
        t00 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(t00)
        t11 = indic_trans(temp_sentence, source_script, dest_script)
        print(t11)
        t = google(temp_sentence, 'en', 'hi')
        # Borrow up to two trailing words from the Azure result when the
        # google result came back short.
        t22 = google_length_checker(t, temp_sentence, t00)
        print("T22 transliteration", t22)
        t33 = indic_transliteration_IAST(temp_sentence)
        print(t33)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t00, t11, t22, t33)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            outputs[3] = Halant_remover(outputs[3])
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_dev_rom_ph1_sentence_wise(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using four candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``indic_trans``, ``azure_transliteration``, ``libindic`` and
    ``sheetal``.  The results are aligned token by token with the source
    words, ``compare_outputs_transliteration`` is asked to pick the output
    for every word, and ``final_transliterated_sentence`` receives the
    original sentence plus the joined picks (presumably to put the
    punctuation back -- confirm against its definition).

    A fifth engine ("ritwik") was previously listed and is currently disabled.

    Args:
        text: Input text; split with ``sentence_split(..., lang='hi')``.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal'}
    priority_list = ['indic_trans', 'Azure', 'sheetal', 'libindic']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)
        t0 = indic_trans(temp_sentence, source_script, dest_script)
        print(t0)
        t1 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(t1)
        t2 = libindic(temp_sentence, dest_script).rstrip()
        print(t2)
        t3 = sheetal(temp_sentence).replace('\n', '')
        print(t3)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t0, t1, t2, t3)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
    """Word-by-word variant: every word is sent to the engines on its own.

    The text is split with ``sentence_split(..., lang='hi')``; a few
    punctuation-only sentences and standalone "." words are skipped.  Each
    remaining word goes to ``indic_trans``, ``azure_transliteration``,
    ``libindic`` and ``sheetal``, and ``compare_outputs_transliteration``
    is asked to pick the result.  A fifth engine ("ritwik") is currently
    disabled.

    Args:
        text: Input text.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The picked words joined by spaces within a sentence, and the
        sentences joined by spaces.
    """
    engine_labels = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal'}
    engine_priority = ['indic_trans', 'Azure', 'sheetal', 'libindic']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    rendered_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        picked_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Engines are called in label order: '0', '1', '2', '3'.
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                libindic(word, dest_script).rstrip(),
                sheetal(word).replace('\n', ''),
            ]
            picked_words.append(
                compare_outputs_transliteration(word, candidates, engine_labels, engine_priority)
            )
        rendered_sentences.append(" ".join(picked_words))
    return " ".join(rendered_sentences)
def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three candidate engines.

    The text is split with ``sentence_split(..., lang='en')``; a few
    punctuation-only sentences and standalone "." words are skipped.  Each
    remaining word goes to ``indic_trans``, ``azure_transliteration`` and
    ``buck_2_unicode``, and ``compare_outputs_transliteration`` is asked to
    pick the result.

    NOTE(review): ``buck_2_unicode`` is not among the imports visible at the
    top of this file; confirm it is defined elsewhere in the module.

    Args:
        text: Input text.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The picked words joined by spaces within a sentence, and the
        sentences joined by spaces.
    """
    print("hello")
    engine_labels = {'0':'indic_trans', '1':'Azure', '2':'buck_2_unicode'}
    engine_priority = ['indic_trans', 'Azure', 'buck_2_unicode']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    rendered_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        picked_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Engines are called in label order: '0', '1', '2'.
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                buck_2_unicode(word),
            ]
            picked_words.append(
                compare_outputs_transliteration(word, candidates, engine_labels, engine_priority)
            )
        rendered_sentences.append(" ".join(picked_words))
    return " ".join(rendered_sentences)
def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using four candidate engines.

    The text is split with ``sentence_split(..., lang='en')``; a few
    punctuation-only sentences and standalone "." words are skipped.  Each
    remaining word goes to ``om_transliterator``, ``indic_trans``,
    ``libindic`` and ``azure_transliteration``, and
    ``compare_outputs_transliteration`` is asked to pick the result.

    Args:
        text: Input text.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The picked words joined by spaces within a sentence, and the
        sentences joined by spaces.
    """
    print("hello")
    engine_labels = {'0':'om_transliteration', '1':'indic_trans', '2':'libindic', '3':'Azure'}
    engine_priority = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    rendered_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        picked_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Engines are called in label order: '0', '1', '2', '3'.
            candidates = [
                om_transliterator(word),
                indic_trans(word, source_script, dest_script),
                libindic(word, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
            ]
            picked_words.append(
                compare_outputs_transliteration(word, candidates, engine_labels, engine_priority)
            )
        rendered_sentences.append(" ".join(picked_words))
    return " ".join(rendered_sentences)
def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three candidate engines.

    The text is split with ``sentence_split(..., lang='en')``; a few
    punctuation-only sentences and standalone "." words are skipped.  Each
    remaining word goes to ``azure_transliteration``, ``libindic`` and
    ``indic_trans``, and ``compare_outputs_transliteration`` is asked to
    pick the result.

    The candidate list is built in the same order as ``sources_name``
    (Azure, libindic, indic_trans) so that every label refers to the engine
    that actually produced the output.  This assumes
    ``compare_outputs_transliteration`` pairs ``outputs[i]`` with
    ``sources_name[str(i)]`` -- confirm against its definition.

    Args:
        text: Input text.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The picked words joined by spaces within a sentence, and the
        sentences joined by spaces.
    """
    print("hello")
    sources_name = {'0':'Azure', '1':'libindic', '2':'indic_trans'}
    priority_list = ['Azure', 'libindic', 'indic_trans']
    etc_punctuation = ("", " . . .", " . .", " . . ”")
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print(sentence)
        OUT = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            azure_out = azure_transliteration(word, source_lang, source_script, dest_script)
            libindic_out = libindic(word, dest_script)
            indic_trans_out = indic_trans(word, source_script, dest_script)
            # Position must follow sources_name: '0' Azure, '1' libindic,
            # '2' indic_trans.
            outputs = [azure_out, libindic_out, indic_trans_out]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            OUT.append(out)
        transliterated_text.append(" ".join(OUT))
    return " ".join(transliterated_text)
def dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three candidate engines.

    The text is split with ``sentence_split(..., lang='en')``; a few
    punctuation-only sentences and standalone "." words are skipped.  Each
    remaining word goes to ``azure_transliteration``, ``indic_trans`` and
    ``libindic``, and ``compare_outputs_transliteration`` is asked to pick
    the result.

    Args:
        text: Input text.
        source_lang: Language code forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the engines.
        dest_script: Script name forwarded to the engines.

    Returns:
        The picked words joined by spaces within a sentence, and the
        sentences joined by spaces.
    """
    print("hello gujarati to latin")
    engine_labels = {'0':'Azure', '1':'indic_trans', '2':'libindic'}
    engine_priority = ['Azure', 'indic_trans', 'libindic']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    rendered_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        picked_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Engines are called in label order: '0', '1', '2'.
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                libindic(word, dest_script),
            ]
            picked_words.append(
                compare_outputs_transliteration(word, candidates, engine_labels, engine_priority)
            )
        rendered_sentences.append(" ".join(picked_words))
    return " ".join(rendered_sentences)
def dial_comparison_transliteration_latin_gurmukhi(text,source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using three candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``azure_transliteration``, ``indic_transliteration_GURMUKHI``
    and ``unicode_transliteration_GURMUKHI``.  The results are aligned token
    by token with the source words, ``compare_outputs_transliteration`` is
    asked to pick the output for every word, and
    ``final_transliterated_sentence`` receives the original sentence plus the
    joined picks (presumably to put the punctuation back -- confirm against
    its definition).

    Args:
        text: Input text; split with ``sentence_split(..., lang='en')``.
        source_lang: Ignored -- the value is overridden with ``"pa"``.
        source_script: Script name forwarded to ``azure_transliteration``.
        dest_script: Script name forwarded to ``azure_transliteration``.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    # The caller's language code is overridden: this path always passes "pa".
    source_lang = "pa"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'indic_trans_IAST']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        temp_sentence = punct_remover(sentence)
        t00 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        t11 = indic_transliteration_GURMUKHI(temp_sentence)
        t22 = unicode_transliteration_GURMUKHI(temp_sentence)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t00, t11, t22)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_latin_cyrillic(text,source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using two candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``azure_transliteration`` and ``transliteration_LATIN_CYRILLIC``.
    The results are aligned token by token with the source words,
    ``compare_outputs_transliteration`` is asked to pick the output for
    every word, and ``final_transliterated_sentence`` receives the original
    sentence plus the joined picks (presumably to put the punctuation back --
    confirm against its definition).

    Args:
        text: Input text; split with ``sentence_split(..., lang='en')``.
        source_lang: Ignored -- the value is overridden with ``"bg"``.
        source_script: Script name forwarded to ``azure_transliteration``.
        dest_script: Script name forwarded to ``azure_transliteration``.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    # The caller's language code is overridden: this path always passes "bg".
    source_lang = "bg"
    sources_name = {'0':'Azure', '1':'indic_trans'}
    priority_list = ['Azure', 'indic_trans']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        temp_sentence = punct_remover(sentence)
        t00 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        t11 = transliteration_LATIN_CYRILLIC(temp_sentence)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t00, t11)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_latin_telugu_sentence_wise(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using two candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``indic_transliteration_TELUGU`` and ``azure_transliteration``.
    The results are aligned token by token with the source words,
    ``compare_outputs_transliteration`` is asked to pick the output for
    every word, and ``final_transliterated_sentence`` receives the original
    sentence plus the joined picks (presumably to put the punctuation back --
    confirm against its definition).

    NOTE(review): sentences are split with ``lang='hi'`` here, while the other
    Latin-source routines in this file use ``lang='en'`` -- confirm intended.

    Args:
        text: Input text.
        source_lang: Ignored -- the value is overridden with ``"te"``.
        source_script: Script name forwarded to ``azure_transliteration``.
        dest_script: Script name forwarded to ``azure_transliteration``.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    # The caller's language code is overridden: this path always passes "te".
    source_lang = "te"
    sources_name = {'0':'indic_trans', '1':'Azure'}
    priority_list = ['indic_trans', 'Azure']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)
        t0 = indic_transliteration_TELUGU(temp_sentence)
        print(t0)
        t1 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(t1)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t0, t1)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_gurmukhi_latin_sentence_wise(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using three candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``indic_transliteration_GURMUKHI_LATIN``,
    ``azure_transliteration`` and ``unicode_transliteration_GURMUKHI_LATIN``.
    The results are aligned token by token with the source words,
    ``compare_outputs_transliteration`` is asked to pick the output for
    every word, and ``final_transliterated_sentence`` receives the original
    sentence plus the joined picks (presumably to put the punctuation back --
    confirm against its definition).

    NOTE(review): ``indic_transliteration_GURMUKHI_LATIN`` and
    ``unicode_transliteration_GURMUKHI_LATIN`` are not among the imports
    visible at the top of this file (only the ``*_GURMUKHI`` variants are);
    confirm they are defined elsewhere in the module.

    Args:
        text: Input text; split with ``sentence_split(..., lang='hi')``.
        source_lang: Ignored -- the value is overridden with ``"pa"``.
        source_script: Script name forwarded to ``azure_transliteration``.
        dest_script: Script name forwarded to ``azure_transliteration``.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    # The caller's language code is overridden: this path always passes "pa".
    source_lang = "pa"
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'unicode'}
    priority_list = ['indic_trans', 'Azure', 'unicode']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)
        t0 = indic_transliteration_GURMUKHI_LATIN(temp_sentence)
        print(t0)
        t1 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(t1)
        t2 = unicode_transliteration_GURMUKHI_LATIN(temp_sentence).rstrip()
        print(t2)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t0, t1, t2)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_cyrilic_latin_sentence_wise(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using three candidate engines.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to ``azure_transliteration``, ``transliteration_CYRILIC_LATIN`` and
    ``ConvertToLatin``.  The results are aligned token by token with the
    source words, ``compare_outputs_transliteration`` is asked to pick the
    output for every word, and ``final_transliterated_sentence`` receives the
    original sentence plus the joined picks (presumably to put the
    punctuation back -- confirm against its definition).

    NOTE(review): ``transliteration_CYRILIC_LATIN`` and ``ConvertToLatin`` are
    not among the imports visible at the top of this file; confirm they are
    defined elsewhere in the module.

    Args:
        text: Input text; split with ``sentence_split(..., lang='hi')``.
        source_lang: Ignored -- the value is overridden with ``"bg"``.
        source_script: Script name forwarded to ``azure_transliteration``.
        dest_script: Script name forwarded to ``azure_transliteration``.

    Returns:
        The processed sentences joined by single spaces.

    Raises:
        IndexError: If an engine returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    print("cyrillic to latin")
    # The caller's language code is overridden: this path always passes "bg".
    source_lang = "bg"
    # NOTE(review): label '0' is 'indic_trans' and '1' is 'Azure', yet the
    # candidate list below holds the Azure result at position 0 and the
    # transliteration_CYRILIC_LATIN result at position 1.  Left unchanged
    # because the intended priority is unclear -- confirm and align.
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'unicode'}
    priority_list = ['indic_trans', 'Azure', 'unicode']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue
        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)
        t0 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print("t0", t0)
        t1 = transliteration_CYRILIC_LATIN(temp_sentence)
        print("t1", t1)
        t2 = ConvertToLatin(temp_sentence)
        print("t2", t2)

        words = temp_sentence.split()
        # Tokenise every engine result once instead of once per word.  Skipped
        # for an empty sentence so the engine results are never touched then.
        token_rows = [result.split() for result in (t0, t1, t2)] if words else []
        Out = []
        for i, word in enumerate(words):
            # Plain indexing (not zip) keeps the IndexError when an engine
            # produced fewer tokens than there are source words.
            outputs = [row[i] for row in token_rows]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            Out.append(out)
        trans_sent_wo_punct = " ".join(Out)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def transliterate(dest_script,src_script,src_lang,text):
    """Route ``text`` to the transliteration routine for a script pair.

    Supported pairs have ``"Latin"`` on one side.  Every routine is called as
    ``routine(text, src_lang, src_script, dest_script)`` and its result is
    returned.  A pair with no matching routine returns ``text`` unchanged.

    NOTE(review): the Latin -> Devanagari branch calls
    ``dial_comparison_transliteration_rom_dev_ph1``, which is not defined in
    the visible part of this file (only the ``_sentence_wise`` variant is);
    confirm it exists elsewhere in the module.

    Args:
        dest_script: Target script name, e.g. ``"Latin"``.
        src_script: Source script name, e.g. ``"Devanagari"``.
        src_lang: Source language code forwarded to the routine.
        text: Text to transliterate.

    Returns:
        The transliterated text, or ``text`` itself for unsupported pairs.
    """
    call_args = (text, src_lang, src_script, dest_script)

    if dest_script == "Latin":
        if src_script == "Devanagari":
            return dial_comparison_transliteration_dev_rom_ph1_sentence_wise(*call_args)
        if src_script == "Arabic":
            return dial_comparison_transliteration_arbic_to_rom_ph1(*call_args)
        if src_script == "Kannada":
            return dial_comparison_transliteration_kann_to_rom_ph1(*call_args)
        if src_script == "Tamil":
            return dial_comparison_transliteration_tamil_to_rom_ph1(*call_args)
        if src_script == "Gurmukhi":
            return dial_comparison_transliteration_gurmukhi_latin_sentence_wise(*call_args)
        if src_script == "Cyrillic":
            return dial_comparison_transliteration_cyrilic_latin_sentence_wise(*call_args)
        if src_script == "Gujarati":
            print("Gujarti to latin hoga")
            return dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(*call_args)
        # These scripts all share one routine.
        if src_script in ("Bengali", "Telugu", "Malayalam", "Oriya"):
            return dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(*call_args)
    elif src_script == "Latin":
        if dest_script == "Devanagari":
            return dial_comparison_transliteration_rom_dev_ph1(*call_args)
        if dest_script == "Gurmukhi":
            return dial_comparison_transliteration_latin_gurmukhi(*call_args)
        if dest_script == "Cyrillic":
            return dial_comparison_transliteration_latin_cyrillic(*call_args)
        if dest_script == "Telugu":
            return dial_comparison_transliteration_latin_telugu_sentence_wise(*call_args)

    # Fallback for unsupported pairs: hand the text back untouched.
    # (Earlier note on this default, translated: "do not initialise".)
    return text