"""Multi-source transliteration with per-word source selection.

Each public ``dial_comparison_transliteration_*`` function sends the input to
several transliteration back-ends and lets
``selection_source_transliteration`` choose one candidate per word.
``all_transliteration`` is the script-pair dispatcher.
"""
import logging
import re
import os
from .buck_2_unicode import buck_2_unicode
from indicnlp.tokenize import sentence_tokenize
from .transliteration_resources import azure_transliteration, om_transliterator, \
    libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik
from .translation_resources import google, aws, azure, yandex
from .selection_source import selection_source, function5, function41, function311, function221, \
    function2111, function11111, selection_source_transliteration, two_sources_two_outputs

# NOTE(review): `indic_trans` is called by several functions below but is not
# imported anywhere in this module. Confirm where it is defined (presumably
# `.transliteration_resources`) and add it to the imports above.

logger = logging.getLogger(__name__)

# Sentence fragments produced by the sentence splitter that carry no words.
_SKIPPED_SENTENCES = ("", " . . .", " . .", " . . ”")

# Tokens / trailing characters treated as punctuation when punctuation is
# re-attached to a transliterated sentence.
_PUNCT_TOKENS = frozenset([
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    '{', '|', '}', '~', '…', '...',
])

# Characters blanked out before a sentence is sent to the back-ends.
_PUNCTUATION_CHARS = r'''!()-[]{};:'"\,<>./?@#$%^&*_~…।'''
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in _PUNCTUATION_CHARS})

# Language code handed to the Azure back-end for each source script.
# NOTE(review): the original code passed the whole input text as the language
# argument. The codes below follow the "hi" convention used by the Devanagari
# paths; "ar" for Arabic script is an assumption (could be "ur") - confirm.
_SCRIPT_LANGUAGE = {
    "Arabic": "ar",
    "Kannada": "kn",
    "Tamil": "ta",
    "Bengali": "bn",
    "Telugu": "te",
    "Malayalam": "ml",
}


def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Return the selected transliteration for one word.

    Delegates to ``selection_source_transliteration`` and returns the first
    element of the first value it yields; the second value is unused here.

    ``word`` is accepted for interface compatibility but is not used.
    """
    O1ANDS1, O2ANDS2 = selection_source_transliteration(sources_name, outputs, priority_list)
    return O1ANDS1[0]


def space_after_punct(text):
    """Insert a space after selected punctuation and collapse whitespace runs.

    ``'. . .'`` is first normalised to ``' ... '`` so a spaced ellipsis
    becomes a single token.
    """
    text = text.replace('. . .', ' ... ')
    text = re.sub(r'([,!?()…-])', r'\1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text


def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to ``transliterated``.

    ``transliterated`` is the punctuation-free, space-separated result for
    ``original``. Pure punctuation tokens are copied through; a word ending in
    a punctuation character gets that character appended to its
    transliteration.

    If ``transliterated`` has fewer words than ``original`` needs, the
    remaining original tokens are kept unchanged instead of raising
    ``IndexError``.
    """
    source_tokens = space_after_punct(original).split()
    target_tokens = transliterated.split()
    rebuilt = []
    cursor = 0
    for token in source_tokens:
        if token in _PUNCT_TOKENS:
            rebuilt.append(token)
            continue
        if cursor >= len(target_tokens):
            # Word counts diverged; keep the untransliterated token.
            logger.warning("no transliteration available for token %r", token)
            rebuilt.append(token)
            continue
        candidate = target_tokens[cursor]
        cursor += 1
        if token[-1] in _PUNCT_TOKENS:
            candidate += token[-1]
        rebuilt.append(candidate)
    result = " ".join(rebuilt)
    # str.replace returns a new string; the result must be re-bound.
    result = result.replace(' ... ', '...')
    result = result.replace('… ', '…')
    return result


def punct_remover(string):
    """Return ``string`` with every punctuation character replaced by a space."""
    return string.translate(_PUNCT_TO_SPACE)


def google_length_checker(t, temp_sentence, t0):
    """Pad ``t`` with trailing words of ``t0`` when it is one or two words short.

    ``t`` is compared by word count with ``temp_sentence``. If it is exactly
    one or two words shorter, the last one or two words of ``t0`` are
    appended. In every other case ``t`` is returned unchanged. If ``t0`` has
    fewer words than needed, only the available ones are appended.
    """
    missing = len(temp_sentence.split()) - len(t.split())
    if missing in (1, 2):
        padding = t0.split()[-missing:]
        return " ".join([t] + padding)
    return t


def Halant_remover(T3):
    """Strip one trailing halant (``्``) from ``T3``; safe on empty input."""
    if T3.endswith("्"):
        return T3[:-1]
    return T3


def _tokens_if_aligned(output, expected_count):
    """Return the words of ``output`` if there are ``expected_count``, else None."""
    tokens = output.split()
    if len(tokens) == expected_count:
        return tokens
    return None


def _candidates_at(aligned_sources, index):
    """Return each source's word at ``index``; "" for a misaligned source."""
    return [tokens[index] if tokens is not None else "" for tokens in aligned_sources]


def _transliterate_word_by_word(text, sources_name, priority_list, transliterators):
    """Transliterate ``text`` one word at a time and join the selected outputs.

    ``transliterators`` are one-argument callables, ordered to match the
    indices in ``sources_name``. Standalone ``"."`` tokens are dropped.
    """
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in _SKIPPED_SENTENCES:
            continue
        logger.debug("sentence: %s", sentence)
        chosen = []
        for word in sentence.split():
            if word == ".":
                continue
            outputs = [transliterate(word) for transliterate in transliterators]
            chosen.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list))
        transliterated_text.append(" ".join(chosen))
    return " ".join(transliterated_text)


##rom-dev
def dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, source_script, dest_script):
    """Transliterate Latin-script text to Devanagari, sentence by sentence.

    Each sentence is stripped of punctuation and sent whole to four sources.
    A source whose output has a different word count than the input
    contributes an empty candidate for every word. Punctuation is re-attached
    after a candidate is selected per word.
    """
    sources_name = {'0': 'Azure', '1': 'indic_trans', '2': 'google', '3': 'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    source_lang = "hi"
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in _SKIPPED_SENTENCES:
            continue
        temp_sentence = punct_remover(sentence)
        words = temp_sentence.split()
        azure_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        indic_out = indic_trans(temp_sentence, source_script, dest_script)
        # Google may drop trailing words; borrow them from the Azure output.
        google_out = google_length_checker(
            google(temp_sentence, 'en', 'hi'), temp_sentence, azure_out)
        iast_out = indic_transliteration_IAST(temp_sentence)
        logger.debug("candidates for %r: %r | %r | %r | %r",
                     temp_sentence, azure_out, indic_out, google_out, iast_out)
        aligned = [_tokens_if_aligned(candidate, len(words))
                   for candidate in (azure_out, indic_out, google_out, iast_out)]
        chosen = []
        for index, word in enumerate(words):
            outputs = _candidates_at(aligned, index)
            # Only the IAST source has its trailing halant stripped.
            outputs[3] = Halant_remover(outputs[3])
            chosen.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list))
        trans_sent_wo_punct = " ".join(chosen)
        transliterated_text.append(
            final_transliterated_sentence(sentence, trans_sent_wo_punct))
    return " ".join(transliterated_text)


##dev_rom
def dial_comparison_transliteration_dev_rom_ph1_sentence_wise(text, source_script, dest_script):
    """Transliterate Devanagari text to Latin script, sentence by sentence.

    Works like the rom-dev function but with five sources. A source whose
    word count does not match the input contributes empty candidates instead
    of raising ``IndexError``.
    """
    sources_name = {'0': 'indic_trans', '1': 'Azure', '2': 'libindic', '3': 'sheetal', '4': 'ritwik'}
    priority_list = ['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
    source_lang = "hi"
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in _SKIPPED_SENTENCES:
            continue
        temp_sentence = punct_remover(sentence)
        words = temp_sentence.split()
        indic_out = indic_trans(temp_sentence, source_script, dest_script)
        azure_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        libindic_out = libindic(temp_sentence, dest_script).rstrip()
        sheetal_out = sheetal(temp_sentence).replace('\n', '')
        ritwik_out = ritwik(temp_sentence).replace('\n', '').rstrip()
        aligned = [_tokens_if_aligned(candidate, len(words))
                   for candidate in (indic_out, azure_out, libindic_out, sheetal_out, ritwik_out)]
        chosen = []
        for index, word in enumerate(words):
            outputs = _candidates_at(aligned, index)
            chosen.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list))
        trans_sent_wo_punct = " ".join(chosen)
        transliterated_text.append(
            final_transliterated_sentence(sentence, trans_sent_wo_punct))
    return " ".join(transliterated_text)


def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Arabic-script text to Latin script, word by word."""
    sources_name = {'0': 'indic_trans', '1': 'Azure', '2': 'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']
    return _transliterate_word_by_word(text, sources_name, priority_list, [
        lambda word: indic_trans(word, source_script, dest_script),
        lambda word: azure_transliteration(word, source_lang, source_script, dest_script),
        buck_2_unicode,
    ])


def dial_comparison_transliteration_kann_to_rom_ph1(text, source_script, dest_script, source_lang="kn"):
    """Transliterate Kannada text to Latin script, word by word.

    ``source_lang`` is the language code passed to the Azure back-end. It was
    referenced but never defined in the original, so it is now an optional
    trailing parameter; existing three-argument calls keep working.
    """
    sources_name = {'0': 'om_transliteration', '1': 'indic_trans', '2': 'libindic', '3': 'Azure'}
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']
    return _transliterate_word_by_word(text, sources_name, priority_list, [
        om_transliterator,
        lambda word: indic_trans(word, source_script, dest_script),
        lambda word: libindic(word, dest_script),
        lambda word: azure_transliteration(word, source_lang, source_script, dest_script),
    ])


def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Tamil text to Latin script, word by word.

    Candidates are ordered Azure, libindic, indic_trans so that each position
    matches its label in ``sources_name``. The original passed them as
    Azure, indic_trans, libindic, which mislabelled positions 1 and 2.
    """
    sources_name = {'0': 'Azure', '1': 'libindic', '2': 'indic_trans'}
    priority_list = ['Azure', 'libindic', 'indic_trans']
    return _transliterate_word_by_word(text, sources_name, priority_list, [
        lambda word: azure_transliteration(word, source_lang, source_script, dest_script),
        lambda word: libindic(word, dest_script),
        lambda word: indic_trans(word, source_script, dest_script),
    ])


def dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Bengali, Telugu or Malayalam text to Latin, word by word."""
    sources_name = {'0': 'Azure', '1': 'indic_trans', '2': 'libindic'}
    priority_list = ['Azure', 'indic_trans', 'libindic']
    return _transliterate_word_by_word(text, sources_name, priority_list, [
        lambda word: azure_transliteration(word, source_lang, source_script, dest_script),
        lambda word: indic_trans(word, source_script, dest_script),
        lambda word: libindic(word, dest_script),
    ])


def all_transliteration(text, source_script, dest_script):
    """Transliterate ``text`` from ``source_script`` to ``dest_script``.

    Returns ``None`` for empty ``text`` (unchanged from the original) and
    returns ``text`` unchanged for an unsupported script pair. Supported
    pairs are Latin to Devanagari, and Devanagari, Arabic, Kannada, Tamil,
    Bengali, Telugu or Malayalam to Latin.
    """
    if text == "":
        return None
    if source_script == "Latin" and dest_script == "Devanagari":
        return dial_comparison_transliteration_rom_dev_ph1_sentence_wise(
            text, source_script, dest_script)
    if dest_script != "Latin":
        return text
    if source_script == "Devanagari":
        return dial_comparison_transliteration_dev_rom_ph1_sentence_wise(
            text, source_script, dest_script)
    if source_script == "Kannada":
        # This function takes no positional source_lang; the original
        # four-argument call raised TypeError.
        return dial_comparison_transliteration_kann_to_rom_ph1(
            text, source_script, dest_script)
    word_wise_handlers = {
        "Arabic": dial_comparison_transliteration_arbic_to_rom_ph1,
        "Tamil": dial_comparison_transliteration_tamil_to_rom_ph1,
        "Bengali": dial_comparison_transliteration_beng_tel_mal_to_rom_ph1,
        "Telugu": dial_comparison_transliteration_beng_tel_mal_to_rom_ph1,
        "Malayalam": dial_comparison_transliteration_beng_tel_mal_to_rom_ph1,
    }
    handler = word_wise_handlers.get(source_script)
    if handler is None:
        return text
    # Pass a language code, not the text itself, as source_lang.
    return handler(text, _SCRIPT_LANGUAGE[source_script], source_script, dest_script)


# Example usage:
# text = " I am Lokesh."
# source_script = "Latin"
# dest_script = "Devanagari"
# print(all_transliteration(text, source_script, dest_script))