365 lines
14 KiB
Python
Executable File
365 lines
14 KiB
Python
Executable File
import re
|
|
import os
|
|
from .buck_2_unicode import buck_2_unicode
|
|
from indicnlp.tokenize import sentence_tokenize
|
|
from .transliteration_resources import azure_transliteration, om_transliterator, \
|
|
libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik
|
|
|
|
from .translation_resources import google, aws, azure,yandex
|
|
from .selection_source import selection_source, function5, function41, function311, function221, \
|
|
function2111, function11111, selection_source_transliteration, two_sources_two_outputs
|
|
|
|
|
|
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Pick one transliteration of ``word`` from the candidate ``outputs``.

    The decision is delegated to ``selection_source_transliteration``, which
    returns two values; only the first element of the first value is used.

    Args:
        word: The source word (not used by the selection itself).
        outputs: Candidate transliterations, one per source.
        sources_name: Mapping of output position (as a string) to source name.
        priority_list: Source names in order of preference.

    Returns:
        ``selection[0]`` of the first value returned by the selector.
    """
    first_choice, _second_choice = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    return first_choice[0]
|
|
|
|
def space_after_punct(text):
    """Normalise spacing around punctuation so tokens can be split on spaces.

    Steps, applied in order:
      1. A spaced-out ellipsis (``. . .``) becomes `` ... ``.
      2. A space is inserted after each of ``, ! ? ( ) … -``.
      3. Any run of two or more whitespace characters collapses to one space.

    Args:
        text: The sentence to normalise.

    Returns:
        The normalised sentence. No stripping is performed, so a sentence
        ending in one of the characters above ends with a single space.
    """
    text = text.replace('. . .', ' ... ')
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'([,!?()…-])', r'\1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text
|
|
|
|
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to the transliterated words.

    ``original`` is first normalised with ``space_after_punct`` and split on
    whitespace. Each resulting token is handled as follows:

    * a token that is itself a punctuation mark is copied through unchanged;
    * any other token consumes the next word of ``transliterated``; if the
      token ends with a punctuation character, that character is appended
      to the transliterated word.

    Args:
        original: The source sentence, punctuation included.
        transliterated: The transliteration of the sentence with punctuation
            removed, words separated by whitespace.

    Returns:
        The transliterated sentence with punctuation restored. The spaces
        that ``space_after_punct`` introduces around ``...`` and after ``…``
        are removed again.
    """
    punct_marks = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-',
        '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
        '`', '{', '|', '}', '~', '…', '...',
    }

    # Split once instead of re-splitting on every access.
    original_tokens = space_after_punct(original).split()
    transliterated_tokens = transliterated.split()

    sentence = []
    next_word = 0
    for token in original_tokens:
        if token in punct_marks:
            sentence.append(token)
            continue
        if next_word >= len(transliterated_tokens):
            # Fewer transliterated words than source words: keep the source
            # token rather than failing with IndexError.
            sentence.append(token)
            continue
        piece = transliterated_tokens[next_word]
        next_word += 1
        if token[-1] in punct_marks:
            piece += token[-1]
        sentence.append(piece)

    transliterated_sentence = " ".join(sentence)
    # str.replace returns a new string; the result has to be kept.
    transliterated_sentence = transliterated_sentence.replace(' ... ', '...')
    transliterated_sentence = transliterated_sentence.replace('… ', '…')
    return transliterated_sentence
|
|
|
|
# Translation table: every character punct_remover blanks out -> one space.
_PUNCT_TO_SPACE = {
    ord(ch): " " for ch in "!()-[]{};:'\"\\,<>./?@#$%^&*_~…।"
}


def punct_remover(string):
    """Replace every punctuation character in ``string`` with a space.

    The characters affected are ``! ( ) - [ ] { } ; : ' " \\ , < > . / ? @ #
    $ % ^ & * _ ~ …`` and the Devanagari danda ``।``. Each occurrence becomes
    exactly one space, so the length of the string is unchanged and runs of
    punctuation become runs of spaces.

    Args:
        string: The text to clean.

    Returns:
        A copy of ``string`` with the punctuation blanked out.
    """
    # One pass over the text instead of one str.replace per character.
    return string.translate(_PUNCT_TO_SPACE)
|
|
|
|
def google_length_checker(t, temp_sentence, t0):
    """Pad ``t`` with trailing words of ``t0`` when it is one or two words short.

    Word counts are taken by splitting on whitespace. The caller in this
    file passes the Google output as ``t`` and the Azure output as ``t0``.

    Args:
        t: The candidate sentence to check.
        temp_sentence: The reference sentence whose word count is the target.
        t0: A second candidate whose last one or two words are borrowed.

    Returns:
        * ``t`` unchanged when it already has at least as many words as
          ``temp_sentence``;
        * ``t`` plus the last word (or last two words) of ``t0`` when it is
          exactly one (or two) words short;
        * ``t`` unchanged when it is more than two words short, or when
          ``t0`` does not have enough words to borrow.
    """
    print("7777777777777777777777777")
    print("1221")
    missing = len(temp_sentence.split()) - len(t.split())
    if missing <= 0:
        print("1")
        return t
    if missing == 1:
        print("2")
    elif missing == 2:
        print("3")
    else:
        return t

    donor_words = t0.split()
    if len(donor_words) < missing:
        # Not enough words to borrow; indexing t0 would raise IndexError.
        return t
    return t + " " + " ".join(donor_words[-missing:])
|
|
def Halant_remover(T3):
    """Strip one trailing Devanagari halant (``्``) from ``T3``, if present.

    Args:
        T3: The word to clean. May be empty.

    Returns:
        ``T3`` without its final character when that character is ``्``;
        otherwise ``T3`` unchanged.
    """
    # endswith() is safe on an empty string, unlike T3[-1].
    return T3[:-1] if T3.endswith("्") else T3
|
|
|
|
|
|
##rom-dev
|
|
def dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using four sources.

    For every sentence returned by ``sentence_tokenize.sentence_split``
    (``lang='en'``), punctuation is blanked out with ``punct_remover`` and
    the result is sent to Azure, ``indic_trans``, Google (``'en'`` ->
    ``'hi'``, padded by ``google_length_checker``) and the IAST
    transliterator. A source is only used for a sentence when its output has
    exactly as many words as the punctuation-free sentence; otherwise it
    contributes an empty string for every word. The per-word winner comes
    from ``compare_outputs_transliteration`` and punctuation is restored with
    ``final_transliterated_sentence``.

    Args:
        text: The text to transliterate.
        source_script: Script name forwarded to the transliteration sources.
        dest_script: Script name forwarded to the transliteration sources.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    source_lang = "hi"

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in etc_punctuation:
            continue

        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)

        t00 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(t00)
        t11 = indic_trans(temp_sentence, source_script, dest_script)
        print(t11)
        t = google(temp_sentence, 'en', 'hi')
        print("btw", t)
        t22 = google_length_checker(t, temp_sentence, t00)
        print(t22)
        t33 = indic_transliteration_IAST(temp_sentence)
        print(t33)

        print(priority_list)
        # Word lists per source, in the same order as the keys of sources_name.
        candidates = [t00.split(), t11.split(), t22.split(), t33.split()]
        source_words = temp_sentence.split()

        # "yes" when the source produced one word per input word.
        valid_outputs = {}
        for pos, candidate_words in enumerate(candidates):
            print(pos)
            if len(candidate_words) == len(source_words):
                valid_outputs[str(pos)] = "yes"
            else:
                valid_outputs[str(pos)] = "no"

        chosen_words = []
        print(len(source_words))
        for i, word in enumerate(source_words):
            print("7878")
            print(i)
            print(word+"tt")
            for candidate_words in candidates:
                print(len(candidate_words))

            outputs = []
            for pos, candidate_words in enumerate(candidates):
                if valid_outputs[str(pos)] == "yes":
                    piece = candidate_words[i]
                    if pos == 3:
                        # Only the IAST output has its trailing halant removed.
                        piece = Halant_remover(piece)
                else:
                    piece = ""
                print(piece)
                outputs.append(piece)

            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )

        trans_sent_wo_punct = " ".join(chosen_words)
        transliterated_text.append(
            final_transliterated_sentence(sentence, trans_sent_wo_punct)
        )

    return " ".join(transliterated_text)
|
|
|
|
##dev_rom
|
|
def dial_comparison_transliteration_dev_rom_ph1_sentence_wise(text, source_script, dest_script):
    """Transliterate ``text`` sentence by sentence using five sources.

    For every sentence returned by ``sentence_tokenize.sentence_split``
    (``lang='hi'``), punctuation is blanked out with ``punct_remover`` and
    the result is sent to ``indic_trans``, Azure, ``libindic``, ``sheetal``
    and ``ritwik``. The outputs are aligned word by word, the per-word winner
    comes from ``compare_outputs_transliteration``, and punctuation is
    restored with ``final_transliterated_sentence``.

    Args:
        text: The text to transliterate.
        source_script: Script name forwarded to the transliteration sources.
        dest_script: Script name forwarded to the transliteration sources.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
    priority_list = ['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
    etc_punctuation = ["", " . . .", " . .", " . . ”"]
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    source_lang = "hi"

    transliterated_text = []
    for sentence in sentences:
        if sentence in etc_punctuation:
            continue

        temp_sentence = punct_remover(sentence)

        t0 = indic_trans(temp_sentence, source_script, dest_script)
        t1 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        t2 = libindic(temp_sentence, dest_script).rstrip()
        t3 = sheetal(temp_sentence).replace('\n', '')
        t4 = ritwik(temp_sentence).replace('\n', '').rstrip()

        # Split each output once; order matches the keys of sources_name.
        candidates = [t0.split(), t1.split(), t2.split(), t3.split(), t4.split()]

        chosen_words = []
        for i, word in enumerate(temp_sentence.split()):
            # A source that returned fewer words contributes "" instead of
            # raising IndexError; the rom->dev function in this file passes
            # "" to the selector for unusable sources in the same way.
            outputs = [
                candidate_words[i] if i < len(candidate_words) else ""
                for candidate_words in candidates
            ]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )

        # Joined after the loop so the value is defined for wordless sentences.
        trans_sent_wo_punct = " ".join(chosen_words)
        transliterated_text.append(
            final_transliterated_sentence(sentence, trans_sent_wo_punct)
        )

    return " ".join(transliterated_text)
|
|
|
|
|
|
def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three sources.

    Sentences come from ``sentence_tokenize.sentence_split`` (``lang='en'``).
    Each whitespace-separated word other than a lone ``.`` is sent to
    ``indic_trans``, Azure and ``buck_2_unicode``; the result kept for the
    word is the one returned by ``compare_outputs_transliteration``.

    Args:
        text: The text to transliterate.
        source_lang: Language value forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the sources.
        dest_script: Script name forwarded to the sources.

    Returns:
        The chosen words joined with single spaces, sentence after sentence.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)

        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Order matches the keys of sources_name.
            outputs = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                buck_2_unicode(word),
            ]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))

    return " ".join(transliterated_text)
|
|
|
|
def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using four sources.

    Sentences come from ``sentence_tokenize.sentence_split`` (``lang='en'``).
    Each whitespace-separated word other than a lone ``.`` is sent to
    ``om_transliterator``, ``indic_trans``, ``libindic`` and Azure; the
    result kept for the word is the one returned by
    ``compare_outputs_transliteration``.

    ``source_lang`` is a parameter, as in the other word-wise functions of
    this file: the body forwards it to ``azure_transliteration`` and
    ``all_transliteration`` calls this function with four arguments.

    Args:
        text: The text to transliterate.
        source_lang: Language value forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the sources.
        dest_script: Script name forwarded to the sources.

    Returns:
        The chosen words joined with single spaces, sentence after sentence.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    print("hello")
    sources_name = {'0':'om_transliteration', '1':'indic_trans', '2':'libindic', '3':'Azure'}
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']

    transliterated_text = []
    for sentence in sentences:
        if sentence in ("", " . . .", " . .", " . . ”"):
            continue
        print(sentence)

        OUT = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            t0 = om_transliterator(word)
            t1 = indic_trans(word, source_script, dest_script)
            t2 = libindic(word, dest_script)
            t3 = azure_transliteration(word, source_lang, source_script, dest_script)
            # Order matches the keys of sources_name.
            outputs = [t0, t1, t2, t3]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            OUT.append(out)
        transliterated_text.append(" ".join(OUT))

    return " ".join(transliterated_text)
|
|
|
|
def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three sources.

    Sentences come from ``sentence_tokenize.sentence_split`` (``lang='en'``).
    Each whitespace-separated word other than a lone ``.`` is sent to Azure,
    ``libindic`` and ``indic_trans``; the result kept for the word is the one
    returned by ``compare_outputs_transliteration``.

    Args:
        text: The text to transliterate.
        source_lang: Language value forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the sources.
        dest_script: Script name forwarded to the sources.

    Returns:
        The chosen words joined with single spaces, sentence after sentence.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    sources_name = {'0':'Azure', '1':'libindic', '2':'indic_trans'}
    sentences = sentence_tokenize.sentence_split(text, lang='en')
    priority_list = ['Azure', 'libindic', 'indic_trans']

    transliterated_text = []
    for sentence in sentences:
        if sentence in ("", " . . .", " . .", " . . ”"):
            continue
        print(sentence)

        OUT = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            azure_out = azure_transliteration(word, source_lang, source_script, dest_script)
            libindic_out = libindic(word, dest_script)
            indic_trans_out = indic_trans(word, source_script, dest_script)
            # Positions must line up with sources_name ('1' is libindic,
            # '2' is indic_trans), as they do in the other functions here.
            # Assumes the selector maps list positions to these keys.
            outputs = [azure_out, libindic_out, indic_trans_out]
            out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            OUT.append(out)
        transliterated_text.append(" ".join(OUT))

    return " ".join(transliterated_text)
|
|
|
|
|
|
def dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three sources.

    Sentences come from ``sentence_tokenize.sentence_split`` (``lang='en'``).
    Each whitespace-separated word other than a lone ``.`` is sent to Azure,
    ``indic_trans`` and ``libindic``; the result kept for the word is the one
    returned by ``compare_outputs_transliteration``.

    Args:
        text: The text to transliterate.
        source_lang: Language value forwarded to ``azure_transliteration``.
        source_script: Script name forwarded to the sources.
        dest_script: Script name forwarded to the sources.

    Returns:
        The chosen words joined with single spaces, sentence after sentence.
    """
    # NOTE(review): `indic_trans` is not among the names imported at the top
    # of this file as visible here -- confirm it is in scope.
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'libindic'}
    priority_list = ['Azure', 'indic_trans', 'libindic']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue

        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Order matches the keys of sources_name.
            outputs = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                libindic(word, dest_script),
            ]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))

    return " ".join(transliterated_text)
|
|
|
|
|
|
|
|
|
|
def all_transliteration(text, source_script, dest_script):
    """Route ``text`` to the transliteration routine for a script pair.

    Supported pairs are Latin -> Devanagari, Devanagari -> Latin, and
    Arabic / Kannada / Tamil / Bengali / Telugu / Malayalam -> Latin.

    Args:
        text: The text to transliterate.
        source_script: Name of the script ``text`` is written in.
        dest_script: Name of the script to produce.

    Returns:
        ``None`` when ``text`` is the empty string; the routine's result for
        a supported pair; ``text`` unchanged for any other pair.
    """
    if text == "":
        return None

    if source_script == "Latin" and dest_script == "Devanagari":
        return dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, source_script, dest_script)
    if source_script == "Devanagari" and dest_script == "Latin":
        return dial_comparison_transliteration_dev_rom_ph1_sentence_wise(text, source_script, dest_script)

    if dest_script == "Latin":
        # NOTE(review): the routines below receive `text` as their second
        # positional argument as well; where that parameter is named
        # `source_lang` a language value may have been intended -- confirm.
        if source_script == "Arabic":
            return dial_comparison_transliteration_arbic_to_rom_ph1(text, text, source_script, dest_script)
        if source_script == "Kannada":
            return dial_comparison_transliteration_kann_to_rom_ph1(text, text, source_script, dest_script)
        if source_script == "Tamil":
            return dial_comparison_transliteration_tamil_to_rom_ph1(text, text, source_script, dest_script)
        if source_script in ("Bengali", "Telugu", "Malayalam"):
            return dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(text, text, source_script, dest_script)

    # Unsupported pair: hand the input back untouched.
    return text
|
|
|
|
|
|
# text = " I am Lokesh."
|
|
# source_script = "Latin"
|
|
# dest_script = "Devanagari"
|
|
|
|
# print(all_transliteration(text, source_script, dest_script)) |