539 lines
19 KiB
Python
Executable File
539 lines
19 KiB
Python
Executable File
# Module imports
|
|
import subprocess
|
|
from nltk.tokenize import sent_tokenize, regexp_tokenize
|
|
|
|
# internal imports
|
|
from conversion.translation.script_writing import dial_checker, dual_script, addDialogue, default_script
|
|
from conversion.translation.translation_variables import is_in_translation_list, special_characters, code_2_language
|
|
from conversion.translation.all_transliteration import all_transliteration
|
|
from conversion.translation.script_writing import default_script
|
|
from conversion.translation.translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
|
|
from conversion.translation.selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
|
|
from conversion.translation.translation_resources import google, aws, azure, yandex
|
|
from .detection import language_detector, script_det
|
|
|
|
|
|
# -> Placeholder word-level transliteration hook (currently returns its input unchanged)
|
|
def word_transliterate(sentence, dest_script):
    """Return ``sentence`` exactly as received.

    This is a stub: ``dest_script`` is accepted but never read, so no
    transliteration takes place.
    """
    unchanged = sentence
    return unchanged
|
|
|
|
|
|
# # -> to be used when option 5 or 6 is yes from frontend then this function is used for translation
|
|
# def ui_option5_and_6(doc, dial_src_lang, dial_dest_lang, dialogue, dual_dial_script):
|
|
# if dialogue == "":
|
|
# return
|
|
# dial_translate = dial_checker(dial_dest_lang, dial_src_lang)
|
|
# if dial_translate:
|
|
# print("in 51")
|
|
# if is_in_translation_list(dial_src_lang) and is_in_translation_list(dial_dest_lang):
|
|
# trans_text = ui_option5_translate_comparison(
|
|
# dialogue, dial_src_lang, dial_dest_lang)
|
|
#
|
|
# if dual_dial_script == "Yes":
|
|
# dual_script(doc, dialogue,
|
|
# trans_text, dial_dest_lang)
|
|
# else:
|
|
# addDialogue(doc, trans_text, dial_dest_lang)
|
|
# else:
|
|
# print("in 52")
|
|
# if dual_dial_script == "Yes":
|
|
# dual_script(doc, dialogue,
|
|
# dialogue, dial_dest_lang)
|
|
# else:
|
|
# addDialogue(doc, dialogue, dial_dest_lang)
|
|
|
|
|
|
# # -> If option 3 or 4 is yes from frontend then this function is used for translation
|
|
def ui_option3_and_4(dial_src_lang, dial_dest_lang, dialogue, ui_option_1st_choice, dual_dial_script):
    """Translate or transliterate a single dialogue for UI options 3 and 4.

    * ``ui_option_1st_choice == "Yes"``: if ``dial_checker`` reports that a
      translation is needed, the dialogue is translated with
      ``translate_comparison`` (only when both languages pass
      ``is_in_translation_list``; otherwise the dialogue is kept as is).
      If no translation is needed the dialogue is returned unchanged.
    * any other value: the dialogue is transliterated from its detected
      script to ``default_script[dial_src_lang]`` via ``all_transliteration``.

    Returns:
        The resulting text, or ``None`` when ``dual_dial_script == "Yes"``
        (the dual-script writer calls are disabled in this function) or when
        a translation is needed but ``dialogue`` is the empty string.
    """
    print("dial_src_lang", dial_src_lang)
    print("dial_dest_lang", dial_dest_lang)
    print("ui option 1st choice", ui_option_1st_choice)
    wants_dual_script = dual_dial_script == "Yes"

    if ui_option_1st_choice != "Yes":
        # Transliteration-only path.
        print("in ui32")
        source_script = script_det(dialogue)  # detected once, reused below
        print(dialogue, source_script, dial_src_lang)
        output = all_transliteration(
            dialogue, source_script, default_script[dial_src_lang])
        # Dual-script output is not produced here, hence no value.
        return None if wants_dual_script else output

    print("in ui31")
    if not dial_checker(dial_dest_lang, dial_src_lang):
        print("in ui312")
        return None if wants_dual_script else dialogue

    print("in ui311")
    if dialogue == "":
        return None
    print(dial_src_lang, dial_dest_lang, "fjdjfskd")
    if is_in_translation_list(dial_src_lang) and is_in_translation_list(dial_dest_lang):
        print("case 1")
        trans_text = translate_comparison(
            dialogue, dial_src_lang, dial_dest_lang)
        print("the translated text is", trans_text)
    else:
        trans_text = dialogue
        print("no creation of ")

    if wants_dual_script:
        print("case 1")
        return None

    # An empty translation falls back to the untouched dialogue.
    return dialogue if trans_text == "" else trans_text
|
|
|
|
|
|
# -> Converting DOCX to PDF using LibreOffice
|
|
def convert_to_pdf(input_docx, out_folder):
    """Convert ``input_docx`` to PDF with headless LibreOffice.

    Runs ``libreoffice --headless --convert-to pdf --outdir <out_folder>
    <input_docx>`` and waits for the process to finish. The exit status is
    not inspected and nothing is returned.
    """
    command = ['libreoffice', '--headless', '--convert-to', 'pdf',
               '--outdir', out_folder, input_docx]
    converter = subprocess.Popen(command)
    print(['--convert-to', 'pdf', input_docx])
    # Block until LibreOffice exits.
    converter.communicate()
|
|
# saveFile = input_docx.split('.')[0] + ".pdf"
|
|
# change_chmod = subprocess.Popen(['sudo', 'chmod', '777', saveFile])
|
|
# change_chmod.communicate()
|
|
|
|
|
|
def final_out(output1, output2, output3, dest_lang):
    """Return the first candidate written entirely in the target script.

    Each candidate is split into words with ``regexp_tokenize`` and every
    word's script (``script_det``) is compared with
    ``default_script[dest_lang]``. Candidates are tried in the order
    ``output1``, ``output2``, ``output3``; a candidate with no words counts
    as acceptable. If none qualifies, ``output1`` is passed through
    ``word_transliterate`` and returned.
    """
    def in_target_script(text):
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        # default_script is looked up lazily so a candidate without any
        # words never triggers the lookup.
        return all(
            script_det(word) == default_script[dest_lang]
            for word in regexp_tokenize(text, r"[\w']+")
        )

    for candidate in (output1, output2, output3):
        if in_target_script(candidate):
            return candidate
    return word_transliterate(output1, default_script[dest_lang])
|
|
|
|
|
|
def manual_diff_score(trans, sources_name):
    """Pick the translation with the lowest mean ``diff_score`` to the others.

    Note: this definition replaces the ``manual_diff_score`` imported from
    ``conversion.translation.translation_metric`` at the top of the module.

    Args:
        trans: mapping ``"0"``, ``"1"``, ... -> translated text.
        sources_name: mapping with the same keys -> provider name.

    Returns:
        ``(translation, source_name)`` of the selected entry. Ties resolve to
        the lowest index.

    Raises:
        ValueError: if ``sources_name`` is empty.
    """
    n = len(sources_name)
    if n == 0:
        raise ValueError("manual_diff_score() needs at least one translation")
    if n == 1:
        # Nothing to compare against; avoids dividing by (n - 1) == 0.
        return trans["0"], sources_name["0"]

    global_diff = []
    for i in range(n):
        current = trans[str(i)]
        local_diff = 0
        for j in range(n):
            if i == j:
                continue
            other = trans[str(j)]
            # NOTE(review): this parses as `current and (other == " ")`, i.e.
            # it skips whenever the *other* text is a single space and the
            # current one is non-empty. Possibly `current == " " and
            # other == " "` was intended; behaviour is kept as written.
            if current and other == " ":
                continue
            local_diff += diff_score(current, other)
        global_diff.append(local_diff / (n - 1))

    best = global_diff.index(min(global_diff))
    return trans[str(best)], sources_name[str(best)]
|
|
|
|
# -> Compares outputs from all sources (Google, IBM, AWS, etc.) and decides which text to return as output
|
|
def compare_outputs(sentence, t0, trans, sources_name, target_lang):
    """Choose one translation out of several provider outputs.

    The candidates in ``trans`` are scored with ``manual_diff_score``,
    ``gleu_diff_score``, ``meteor_diff_score`` and ``rouge_diff_score``. If
    every scorer selects the same text as ``t0`` that text is returned
    directly. Otherwise the per-scorer winning sources (with ``" "`` standing
    in for scorers that agreed with ``t0``) are handed to
    ``selection_source``; the up-to-three sources it returns are mapped back
    to their translations and ``final_out`` picks the result.

    Args:
        sentence: the source sentence (not used in the selection itself).
        t0: the reference translation, ``trans["0"]`` at the call site.
        trans: mapping ``"0"``, ``"1"``, ... -> translated text.
        sources_name: mapping with the same keys -> provider name.
        target_lang: language code forwarded to ``final_out``.

    Raises:
        KeyError: if a source named by ``selection_source`` is not a value
            of ``sources_name``.
    """
    methods_name = {'0': 'MNF', '1': 'Gleu',
                    '2': 'Meteor', '3': 'Rougen', '4': 'Rougel'}
    google_output = t0

    output1, source1 = manual_diff_score(trans, sources_name)
    output2, source2 = gleu_diff_score(trans, sources_name)
    print("TRans 2-> ", trans)
    print(type(trans))
    output3, source3 = meteor_diff_score(trans, sources_name)
    output4, source4, output5, source5 = rouge_diff_score(
        trans, sources_name)

    picks = (
        (output1, source1),
        (output2, source2),
        (output3, source3),
        (output4, source4),
        (output5, source5),
    )

    if all(output == google_output for output, _ in picks):
        print("all output is same as google")
        return google_output

    # One slot per scoring method, in methods_name order; a blank marks a
    # method whose pick matched the reference translation.
    s = [source if output != google_output else " " for output, source in picks]

    s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
        s, sources_name, trans, methods_name)

    def translation_from(source_name):
        # Reverse lookup provider name -> key; the last match wins, as in a
        # plain loop without ``break``.
        found = None
        for key, name in sources_name.items():
            if name == source_name:
                found = key
        if found is None:
            raise KeyError(
                f"translation source {source_name!r} is not in sources_name")
        return trans[str(found)]

    output1 = translation_from(s1ANDm1[0])
    # An empty source name means "no second/third choice": reuse the first.
    output2 = translation_from(s2ANDm2[0]) if s2ANDm2[0] != "" else output1
    output3 = translation_from(s3ANDm3[0]) if s3ANDm3[0] != "" else output1

    return final_out(output1, output2, output3, target_lang)
|
|
|
|
|
|
# -> Defining own way of declaring Dictionary
|
|
class myDict(dict):
    """``dict`` subclass that adds an ``add(key, value)`` convenience method."""

    def __init__(self):
        # Initialise the underlying dict properly; rebinding the local name
        # ``self`` to a fresh dict (as before) had no effect on the instance.
        super().__init__()

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry."""
        self[key] = value
|
|
|
|
|
|
# -> Main Translation function to be called without any special dots in Sentence
|
|
def all_translator(sentence, source_lang, target_lang, makeExcel=False):
    """Translate ``sentence`` with every enabled provider and pick one result.

    Providers are tried in the order Google, AWS, Azure; a provider that
    raises is skipped. Successful results are stored under the keys ``"0"``,
    ``"1"``, ... in call order. With a single result it is returned as is;
    otherwise ``compare_outputs`` chooses among them, using entry ``"0"`` as
    the reference.

    Args:
        sentence: text to translate.
        source_lang: source language code (not passed to Azure).
        target_lang: target language code.
        makeExcel: when true, also return a string built from all provider
            outputs.

    Returns:
        The chosen translation, or ``(translation, joined_outputs)`` when
        ``makeExcel`` is true.

    Raises:
        KeyError: if no provider produced a translation.
    """
    # Zero-argument callables so that each provider keeps its own call
    # signature while sharing one failure-handling loop.
    providers = (
        ("GOOGLE", lambda: google(sentence, source_lang, target_lang)),
        ("AWS", lambda: aws(sentence, source_lang, target_lang)),
        ("AZURE", lambda: azure(sentence, target_lang)),
    )

    trans = myDict()
    sources_name = myDict()
    for name, translate in providers:
        try:
            result = translate()
        except Exception as exc:
            # Best effort: a failing provider must not abort the others.
            print(f"{name} translation failed:", exc)
            continue
        key = str(len(trans))
        trans.add(key, result)
        sources_name.add(key, name)

    if not trans:
        raise KeyError("0: no translation provider returned a result")

    if len(sources_name) == 1:
        trans_text = trans["0"]
    else:
        print("Trans -> ", trans)
        print(type(trans))
        trans_text = compare_outputs(
            sentence, trans["0"], trans, sources_name, target_lang)

    if makeExcel:
        print("Translated texts are", trans)
        # NOTE(review): the *separator* here is "<sentence> | ", so the
        # source sentence appears between provider outputs rather than in
        # front of them; kept as written, confirm this is intended.
        return trans_text, str(rf"{sentence} | ".join(list(trans.values())))
    return trans_text
|
|
|
|
|
|
# -> Main Translation function to be called with any special dots in Sentence (TRY TO USE THIS FUNCTION FOR TRANSLATION)
|
|
def translate_comparison(text, source_lang, target_lang, makeExcel=False):
    """Translate ``text`` sentence by sentence and join the results.

    ``text`` is split with ``sent_tokenize``. A sentence containing any entry
    of ``special_characters`` is handled by ``recursive_dots``; every other
    sentence goes to ``all_translator``.

    Args:
        text: text to translate.
        source_lang: source language code.
        target_lang: target language code.
        makeExcel: when true, also collect the per-sentence comparison
            strings returned by ``all_translator``.

    Returns:
        The translated sentences joined by single spaces, or
        ``(joined_translation, joined_comparisons)`` when ``makeExcel`` is
        true. Sentences routed through ``recursive_dots`` contribute no
        comparison string.
    """
    print(text, " : Text at 58%")
    translated_text = []
    comparisons = []

    for sentence in sent_tokenize(text):
        if any(ext in sentence for ext in special_characters):
            print("Isme gaya")
            # translation_with_spcecial_dots needs splitter/language/script
            # arguments that are not available here; recursive_dots takes
            # exactly (sentence, source_lang, target_lang).
            translated_text.append(
                recursive_dots(sentence, source_lang, target_lang))
        elif makeExcel:
            # With makeExcel, all_translator returns (translation, outputs).
            trans_text, comparison = all_translator(
                sentence, source_lang, target_lang, makeExcel)
            translated_text.append(trans_text)
            comparisons.append(comparison)
        else:
            translated_text.append(
                all_translator(sentence, source_lang, target_lang))

    joined = " ".join(translated_text)
    if makeExcel:
        return joined, " ".join(comparisons)
    return joined
|
|
|
|
|
|
# -> Handling all special dots in sentence
|
|
|
|
|
|
|
|
# -> Main function for handling sentences with (possibly nested) special dots
|
|
def recursive_dots(Sentence, source_lang, target_lang):
    """Translate a sentence piecewise around ``special_characters``.

    The sentence is split on the first entry of ``special_characters`` it
    contains. Empty and single-space pieces are dropped; a piece that still
    contains a special character is processed recursively; any other piece
    is translated with ``all_translator`` and, unless it is the last piece,
    gets the separator appended again. The pieces are joined with spaces.

    A sentence without any special character is translated in one call
    (blank input yields ``""``).
    """
    if not Sentence.strip():
        return ""

    separator = next(
        (char for char in special_characters if char in Sentence), None)
    if separator is None:
        return all_translator(Sentence, source_lang, target_lang)

    pieces = Sentence.split(separator)
    last_index = len(pieces) - 1
    translated_text = []
    for index, piece in enumerate(pieces):
        if piece == "" or piece == " ":
            continue
        if any(char in piece for char in special_characters):
            # Nested separators: recurse with the same three-argument
            # signature instead of calling a helper that needs more.
            trans_text = recursive_dots(piece, source_lang, target_lang)
        else:
            trans_text = all_translator(piece, source_lang, target_lang)
            # Position-based check: a piece whose text merely equals the
            # last piece must still get its separator back.
            if index != last_index:
                trans_text = trans_text + separator
        translated_text.append(trans_text)

    return " ".join(translated_text)
|
|
|
|
|
|
|
|
def translation_with_spcecial_dots(text, source_lang, target_lang, splitter, line_language, line_script, script_data=None, subsentence_choices=None):
    """Translate ``text`` piece by piece around ``splitter``.

    ``text`` is split on ``splitter``; empty and single-space pieces are
    dropped. A piece that contains another entry of ``special_characters``
    is processed recursively (with its own detected language and script and
    the same ``script_data`` / ``subsentence_choices``). Any other piece is
    translated with ``all_translator`` if the choice selected by
    ``line_language`` / ``line_script`` is truthy, and kept untranslated
    otherwise; unless it is the last piece, ``splitter`` is appended again.

    Args:
        text: the line (or sub-line) to process.
        source_lang: source language code for ``all_translator``.
        target_lang: target language code for ``all_translator``.
        splitter: the special character to split ``text`` on.
        line_language: language detected for ``text`` as a whole.
        line_script: script detected for ``text`` as a whole.
        script_data: mapping read for the keys ``dial_dest_lang``,
            ``dial_dest_script``, ``non_dial_dest_script``,
            ``non_dial_src_lang`` and ``non_dial_dest_lang``.
        subsentence_choices: sequence read at indices 0, 2, 4, 6, 8, 10 and
            12 (one flag per language/script combination, 12 = fallback).
            If this or ``script_data`` is ``None`` every piece is translated.

    Returns:
        The processed pieces joined with single spaces.
    """
    # (language key, script key, index into subsentence_choices), checked in
    # this order; the first matching pair decides.
    rules = (
        ('dial_dest_lang', 'dial_dest_script', 0),
        ('dial_dest_lang', 'non_dial_dest_script', 2),
        ('non_dial_src_lang', 'dial_dest_script', 4),
        ('non_dial_src_lang', 'non_dial_dest_script', 6),
        ('non_dial_dest_lang', 'dial_dest_script', 8),
        ('non_dial_dest_lang', 'non_dial_dest_script', 10),
    )

    def wants_translation():
        if script_data is None or subsentence_choices is None:
            return True
        choice_index = next(
            (index for lang_key, script_key, index in rules
             if line_language == script_data[lang_key]
             and line_script == script_data[script_key]),
            12,
        )
        return bool(subsentence_choices[choice_index])

    sentences = text.split(splitter)
    last_index = len(sentences) - 1
    translated_text = []

    for index, sentence in enumerate(sentences):
        if sentence == "" or sentence == " ":
            continue

        splitter2 = next(
            (ext for ext in special_characters if ext in sentence), None)
        if splitter2:
            # Forward script_data / subsentence_choices so nested pieces are
            # classified with the same settings as the outer call.
            trans_text = translation_with_spcecial_dots(
                sentence, source_lang, target_lang, splitter2,
                language_detector(sentence), script_det(sentence),
                script_data, subsentence_choices)
        else:
            if wants_translation():
                trans_text = all_translator(
                    sentence, source_lang, target_lang)
            else:
                # Choice says "do not translate": keep the piece as written
                # instead of reusing a previous (or unset) translation.
                trans_text = sentence
            # NOTE(review): the last piece used to be sent to all_translator
            # regardless of the choice flag; it now follows the same flag as
            # the other pieces - confirm with the UI contract.
            if index != last_index:
                trans_text = trans_text + splitter
        translated_text.append(trans_text)

    return " ".join(translated_text)
|
|
|
|
|
|
|
|
def translate_comparison2(text, source_lang, target_lang, script_data=None, subsentence_choices=None, is_dialogue=False):
    """Translate ``text``, honouring per-sub-sentence choices for dialogues.

    When ``is_dialogue`` is false the whole text goes to ``all_translator``.

    Otherwise the text is split into sentences: with ``sent_tokenize`` if
    the lower-cased ``code_2_language[source_lang]`` is one of the languages
    listed below, else on ``"."``. Blank pieces are skipped. For each
    sentence the language and script are detected; a sentence containing an
    entry of ``special_characters`` is delegated to
    ``translation_with_spcecial_dots``; any other sentence is translated
    with ``all_translator`` if the matching entry of ``subsentence_choices``
    is truthy and kept untranslated otherwise.

    Args:
        text: text to translate.
        source_lang: source language code.
        target_lang: target language code.
        script_data: mapping read for the keys ``dial_dest_lang``,
            ``dial_dest_script``, ``non_dial_dest_script``,
            ``non_dial_src_lang`` and ``non_dial_dest_lang``.
        subsentence_choices: sequence read at indices 0, 2, 4, 6, 8, 10 and
            12 (12 = no language/script pair matched). If this or
            ``script_data`` is ``None`` every plain sentence is translated.
        is_dialogue: selects the sentence-wise path described above.

    Returns:
        The translated text (sentences joined with single spaces on the
        dialogue path).
    """
    if not is_dialogue:
        return all_translator(text, source_lang, target_lang)

    tokenizers_valid_langs = ("malayalam", "french", "italian", "german", "spanish", "swedish", "finnish", "danish", "english", "slovene", "norwegian", "dutch", "portuguese", "czech", "russian", "polish","turkish", "estonian", "greek")

    language_name = code_2_language[source_lang].lower()
    if language_name in tokenizers_valid_langs:
        sentences = sent_tokenize(text, language=language_name)
    else:
        sentences = text.split(".")

    # (language key, script key, index into subsentence_choices), checked in
    # this order; the first matching pair decides.
    rules = (
        ('dial_dest_lang', 'dial_dest_script', 0),
        ('dial_dest_lang', 'non_dial_dest_script', 2),
        ('non_dial_src_lang', 'dial_dest_script', 4),
        ('non_dial_src_lang', 'non_dial_dest_script', 6),
        ('non_dial_dest_lang', 'dial_dest_script', 8),
        ('non_dial_dest_lang', 'non_dial_dest_script', 10),
    )

    def wants_translation(line_language, line_script):
        if script_data is None or subsentence_choices is None:
            return True
        choice_index = next(
            (index for lang_key, script_key, index in rules
             if line_language == script_data[lang_key]
             and line_script == script_data[script_key]),
            12,
        )
        return bool(subsentence_choices[choice_index])

    # List of translated sentences
    translated_text = []

    # Translating each sentence one by one
    for sentence in sentences:
        # split(".") leaves empty pieces (e.g. after a trailing period);
        # there is nothing to detect or translate in them.
        if not sentence.strip():
            continue

        line_language = language_detector(sentence)
        line_script = script_det(sentence)

        splitter = next(
            (ext for ext in special_characters if ext in sentence), None)
        if splitter:
            trans_text = translation_with_spcecial_dots(
                sentence, source_lang, target_lang, splitter,
                line_language, line_script, script_data, subsentence_choices)
        elif wants_translation(line_language, line_script):
            trans_text = all_translator(sentence, source_lang, target_lang)
        else:
            # Choice says "do not translate": keep the sentence as written
            # instead of reusing a previous (or unset) translation.
            trans_text = sentence
        translated_text.append(trans_text)

    return " ".join(translated_text)
|