import os import sys import docx import re # import textract from tqdm import tqdm from collections import Counter import ntpath from docx.shared import Inches, Cm, Pt from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL import requests, uuid, json import nltk.translate.bleu_score as bleu import nltk.translate.gleu_score as gleu from rouge_score import rouge_scorer import numpy as np import statistics from statistics import mode from indicnlp.tokenize import sentence_tokenize import nltk try: print("time33333") nltk.data.find('tokenizers/punkt') except: #LookupError: nltk.download('punkt') pass try: nltk.data.find('wordnet') except LookupError: ###nltk.download('wordnet') print("error in finding wordnet222222222") from nltk.tokenize import sent_tokenize # import logging # from logger import get_module_logger # log = get_module_logger(__name__) # log.info('Logger working') #google os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="gifted-mountain-318504-0a5f94cda0c8.json" from google.cloud import translate from google.cloud import translate_v2 as Translate translate_client = Translate.Client() client = translate.TranslationServiceClient() project_id = "excellent-hue-272808" location = "global" parent = f"projects/{project_id}/locations/{location}" from script_detector import script_cat from buck_2_unicode import buck_2_unicode from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5 from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs from script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker, non_dial_checker from script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex from transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik from script_writing import default_script from nltk.tokenize import regexp_tokenize #comes from frontend after detection algo # filename1 = sys.argv[1] # dial_src_lang = sys.argv[2] # dial_dest_lang = sys.argv[3] # non_dial_src_lang = sys.argv[4] # non_dial_dest_lang = sys.argv[5] # dual_dial_script = sys.argv[6] #to check whether action lines or dialogues will translate if non_dial_src_lang != non_dial_dest_lang: global_non_dialogue_flag = "Yes" else:global_non_dialogue_flag = "No" if dial_src_lang != dial_dest_lang: global_dialogue_flag = "Yes" else:global_dialogue_flag = "No" translation_list = ['en', 'ta', 'hi', 'ar', 'ur', 'kn', 'gu', 'bg', 'bn', 'te', 'ml', 'ru', 'sr', 'uk', 'hr', 'ga', 'sq', 'mr', 'fa', 'tr', 'hu', 'it', 'ro','pa','gu','or','zh-CN', 'zh-TW','ne','fr','es','id','el','ja','ko','be','uz','sd','af','de','is', 'ig','la','pt','my','th','su','lo','am','si','az','kk','mk','bs','ps','mg','ms','yo','cs','da','nl','tl','no','sl','sv', 'vi','cy','he','hy','km','ka','mn','ku','ky','tk','he','hy','km','ka','mn','ku','ky','tk','fi','ht','haw','lt','lb','mt', 'pl','eo','tt','ug','ha','so','sw','yi','eu','ca','ceb','co','et','fy','gl','hmn','rw','lv','mi','sm','gd','st','sn','sk', 'xh','zu'] # create an instance of a word document doc = docx.Document() doc_file = "translated/" +str(dial_dest_lang) +"_" +"trans" + '_of_'+ ntpath.basename(filename1) #print(doc_file) doc2 = docx.Document() sections = doc2.sections for section in sections: section.top_margin = Inches(0.2) section.bottom_margin = Inches(0.2) section.left_margin = Inches(0.2) section.right_margin = Inches(0.2) section = doc2.sections[-1] new_height = section.page_width section.page_width = section.page_height section.page_height = new_height name = 'Final table '+doc_file doc2.add_heading(name, 0) doc_para = doc2.add_paragraph() doc_para.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True table2 = doc2.add_table(rows=1,cols=4) table2.style = 'TableGrid' hdr_Cells = table2.rows[0].cells hdr_Cells[0].paragraphs[0].add_run("Input").bold=True hdr_Cells[1].paragraphs[0].add_run("Output1").bold=True hdr_Cells[2].paragraphs[0].add_run("Output2").bold=True hdr_Cells[3].paragraphs[0].add_run("Output3").bold=True #process the input script and return scenes refined,total_scenes = getRefined(filename1) #print(refined) #log.debug(refined) sluglines,without_slug = getSlugAndNonSlug(refined) #print(sluglines) #log.debug(sluglines) characters = getSpeakers(without_slug) #log.debug(characters) scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters) print(scenes) #to detect the language def language_detector(text): result = translate_client.translate(text, target_language='hi') det_lang = result["detectedSourceLanguage"] return det_lang class myDict(dict): def __init__(self): self = dict() def add(self, key, value): self[key] = value def all_translator(sentence, source_lang, target_lang): i = 0 trans = myDict() sources_name = myDict() try: globals()['t%s' % i] = google(sentence, source_lang, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "GOOGLE") i = i+1 except: pass try: globals()['t%s' % i] = ibm_watson(sentence, source_lang, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "IBM_WATSON") i = i+1 except: pass try: globals()['t%s' % i] = aws(sentence, source_lang, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "AWS") i = i+1 except: pass try: globals()['t%s' % i]= azure(sentence, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "AZURE") i = i+1 except: pass try: globals()['t%s' % i] = lingvanex(sentence, source_lang, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "LINGVANEX") i = i+1 except: pass try: globals()['t%s' % i] = yandex(sentence, source_lang, target_lang) trans.add(str(i), globals()['t%s' % i]) sources_name.add(str(i), "YANDEX") i = i+1 except: pass trans_text = compare_outputs(sentence, trans["0"], trans, sources_name, target_lang) return trans_text def recursive_dots(Sentence, source_lang, target_lang): special_characters = ['....', '…', '. . .', '...'] translated_text = [] for i in special_characters: if i not in Sentence: continue Sentences = Sentence.split(i) for Sentence in Sentences: if Sentence=="" or Sentence==" ": continue if any(ext in Sentence for ext in special_characters): trans_text = translation_with_spcecial_dots(Sentence, source_lang, target_lang) else: if Sentence!= Sentences[-1]: trans_text = all_translator(Sentence, source_lang, target_lang) + i else: trans_text = all_translator(Sentence, source_lang, target_lang) translated_text.append(trans_text) return " ".join(translated_text) def translation_with_spcecial_dots(Sentence, source_lang, target_lang): special_characters = ['....', '…', '. . .', '...'] translated_text = [] for ext in special_characters: if ext in Sentence: splitter = ext break Sentences = Sentence.split(splitter) for Sentence in Sentences: if Sentence=="" or Sentence==" ": continue if any(ext in Sentence for ext in special_characters): trans_text = recursive_dots(Sentence, source_lang, target_lang) else: if Sentence!= Sentences[-1]: trans_text = all_translator(Sentence, source_lang, target_lang) + splitter else: trans_text = all_translator(Sentence, source_lang, target_lang) translated_text.append(trans_text) return " ".join(translated_text) def translate_comparison(text, source_lang, target_lang): sentences=sent_tokenize(text) special_characters = ['....', '…', '. . .', '...'] translated_text = [] for sentence in sentences: if any(ext in sentence for ext in special_characters): trans_text = translation_with_spcecial_dots(sentence, source_lang, target_lang) translated_text.append(trans_text) else: trans_text = all_translator(sentence, source_lang, target_lang) translated_text.append(trans_text) return " ".join(translated_text) def script_det(text): punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~“"”''' no_punct = "" for char in text: if char not in punctuations: no_punct = char break #print("alphabet", no_punct) script = script_cat(no_punct)[0] #print("script", script) return script def punct_remover(string): #punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~…।“”''' punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~…।1234567890''' for x in string.lower(): if x in punctuations: string = string.replace(x, " ") return string def word_transliterate(sentence, dest_script): return sentence def final_out(output1, output2, output3, dest_lang): temp_output1 = punct_remover(output1) temp_output2 = punct_remover(output2) temp_output3 = punct_remover(output3) #for word in regexp_tokenize(output1, "[\w']+") for word in temp_output1.split(): if script_det(word) != default_script[dest_lang]: for word in temp_output2.split(): if script_det(word) != default_script[dest_lang]: for word in temp_output3.split(): if script_det(word) != default_script[dest_lang]: #print("in3") output1 = word_transliterate(output1, default_script[dest_lang]) return output1 return output3 return output2 return output1 #take a sentence and give translated sentence by comparing outputs from different resources def compare_outputs(sentence, t0, trans, sources_name, target_lang): k=[] s=[] methods_name = {'0':'MNF', '1':'Gleu', '2':'Meteor', '3':'Rougen', '4':'Rougel'} google_output = t0 #print("google", google_output) output1, source1 = manual_diff_score(trans, sources_name) #print("MNF", output1) output2, source2 = gleu_diff_score(trans, sources_name) #print("gleu", output2) output3, source3 = meteor_diff_score(trans, sources_name) #print("meteor", output3) output4, source4, output5, source5 = rouge_diff_score(trans, sources_name) #print("rougen", output4) #print("rougel", output5) if google_output == output1 == output2==output3==output4==output5: #print("all output are same as google") return google_output else: if google_output != output1: k.append(output1) s.append(source1) else: k.append(" ") s.append(" ") if google_output != output2: k.append(output2) s.append(source2) else: k.append(" ") s.append(" ") if google_output != output3: k.append(output3) s.append(source3) else: k.append(" ") s.append(" ") if google_output != output4: k.append(output4) s.append(source4) else: k.append(" ") s.append(" ") if google_output != output5: k.append(output5) s.append(source5) else: k.append(" ") s.append(" ") k.insert(0,sentence) k.insert(1,google_output) s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(s, sources_name, trans, methods_name ) # print("s1", s1ANDm1) # print("s2", s2ANDm2) # print("s3", s3ANDm3) #print(s1ANDm1[0]) #print(sources_name) #add_dial_comparison_doc1a(doc1a, table1a , k, s, s1ANDm1[0]) #add_dial_comparison_doc1b(doc1b, table1b , k, s, s1ANDm1[0]) add_dial_comparison_doc2(doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans) for a, b in sources_name.items(): if b == s1ANDm1[0]: k = a output1 = trans[str(k)] if s2ANDm2[0] != "": for c, d in sources_name.items(): if d == s2ANDm2[0]: l = c output2 = trans[str(l)] else: output2 = output1 if s3ANDm3[0] != "": for e, f in sources_name.items(): if f == s3ANDm3[0]: m = e output3 = trans[str(m)] else: output3 = output1 # print("output1", output1) # print("output2", output2) # print("output3", output3) output = final_out(output1, output2, output3, target_lang) # print("output", output) return output #to return the table with best 3 outputs def add_dial_comparison_doc2(doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans): row_Cells = table2.add_row().cells for a, b in sources_name.items(): if b == s1ANDm1[0]: k = a output1 = trans[str(k)] row_Cells[0].text= sentence row_Cells[1].text= output1 row_Cells[1].paragraphs[0].add_run('(Source : '+str(s1ANDm1[0])+')') row_Cells[1].paragraphs[0].add_run('(Methods : '+str(s1ANDm1[1])+')') if s2ANDm2[0] == "": row_Cells[2].text= "" else: for a, b in sources_name.items(): if b == s2ANDm2[0]: k = a output2 = trans[str(k)] row_Cells[2].text= output2 row_Cells[2].paragraphs[0].add_run('(Source : '+str(s2ANDm2[0])+')') row_Cells[2].paragraphs[0].add_run('(Methods : '+str(s2ANDm2[1])+')') if s3ANDm3[0] == "": row_Cells[3].text= "" else: for a, b in sources_name.items(): if b == s3ANDm3[0]: k = a output3 = trans[str(k)] row_Cells[3].text= output3 row_Cells[3].paragraphs[0].add_run('(Source : '+str(s3ANDm3[0])+')') row_Cells[3].paragraphs[0].add_run('(Methods : '+str(s3ANDm3[1])+')') for scene in tqdm(scenes[:1]): for i,line in enumerate(scene): if i == 0: addSlugLine(doc,line) continue if type(line)==type(""): if global_non_dialogue_flag == "Yes": if non_dial_src_lang in translation_list and non_dial_dest_lang in translation_list: trans_text = translate_comparison(line , non_dial_src_lang, non_dial_dest_lang) addActionLine(doc, trans_text, non_dial_dest_lang) else: addActionLine(doc, line, non_dial_dest_lang) else: addActionLine(doc, line, non_dial_dest_lang) else: #print(line) [speaker] = line.keys() #print([speaker]) if speaker == 'Transition': addTransition(doc,line[speaker]) continue addSpeaker(doc,speaker) if global_dialogue_flag == "Yes": print("In dialogue") #print("dilo") print("dialogue", line[speaker][2]) dial_src_lang = language_detector(line[speaker][2]) print("dial_src_lang", dial_src_lang) #print("p", line[speaker][0]) if line[speaker][0] != 'NONE': print("parenthitical", line[speaker][0]) par_lang = language_detector(line[speaker][0]) out = google(line[speaker][0], par_lang, dial_dest_lang) #out = google(line[speaker][0], dial_src_lang, dial_dest_lang) addParenthetical(doc,out) dial_translate = dial_checker(dial_dest_lang, dial_src_lang) print("dial_translate", dial_translate) #dial_translate = True if dial_translate: print("dialogue to be translated", line[speaker][2]) if line[speaker][2] == "": continue if dial_src_lang in translation_list and dial_dest_lang in translation_list: trans_text = translate_comparison(line[speaker][2] , dial_src_lang, dial_dest_lang) if dual_dial_script == "Yes": dual_script(doc, line[speaker][2], trans_text, dial_dest_lang) else: addDialogue(doc, trans_text, dial_dest_lang) else: addDialogue(doc, line[speaker][2], dial_dest_lang) else: addParenthetical(doc,line[speaker][0]) addDialogue(doc, line[speaker][2], dial_dest_lang) doc.save(doc_file) doc2.save("final_comparision.docx") #doc2.save(str(filename1.split('.')[0])+"_trans_to_"+str(dial_dest_lang)+'_'+"final.docx")