"""Transliterate the dialogues of a screenplay and write the result to .docx files.

Positional command-line arguments:
    1. filename1 -- script whose dialogues are transliterated
       (the translated file coming from UI-1).
    2. dial_dest_script -- destination script name, e.g. "Latin" or "Devanagari".
    3. dual_dial_script -- "Yes"/"No"; "Yes" writes each dialogue through
       ``dual_script`` (source text together with its transliteration).
    4. translation_and_transliteration -- "Yes"/"No"; with "Yes" the source text
       shown in dual mode is taken from ``filename2`` instead of ``filename1``.
    5. filename2 -- original file.

Outputs:
    * ``translated/trans<basename of filename1>`` -- the transliterated script.
    * ``final.docx`` -- one table row per compared word/sentence candidate.
"""
import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests
import uuid
import json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
from indicnlp.tokenize import sentence_tokenize
import nltk

# Make sure the punkt tokenizer data is available; download it on any lookup
# failure.  ``Exception`` (rather than a bare ``except``) keeps the original
# best-effort behaviour without swallowing KeyboardInterrupt / SystemExit.
try:
    print("time2222")
    nltk.data.find('tokenizers/punkt')
except Exception:
    nltk.download('punkt')

try:
    nltk.data.find('wordnet')
except LookupError:
    # The wordnet download is intentionally disabled; only report the problem.
    print("error in finding wordnet11111")

# google
# The credentials path has to be in the environment before the clients below
# are created, so this assignment must stay ahead of them.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
from script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker, non_dial_checker
from script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Only this many leading scenes are read from / written for each input file.
MAX_SCENES = 5

# Characters ignored by script_det() when looking for the first "real" character.
_SCRIPT_DET_PUNCTUATION = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''

# Characters punct_remover() turns into a single space.
_REMOVABLE_PUNCTUATION = '''!()-[]{};:'"\\,<>./?@#$%^&*_~…।'''
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in _REMOVABLE_PUNCTUATION})

# Tokens / trailing characters treated as punctuation when punctuation is
# re-attached to a transliterated sentence.
_TOKEN_PUNCTUATION = frozenset([
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{',
    '|', '}', '~', '…', '...', '।',
])

# Sentences made only of ellipsis residue; they are skipped entirely.
_SKIPPED_SENTENCES = frozenset(["", " . . .", " . .", " . . ”"])


# ---------------------------------------------------------------------------
# Document helpers
# ---------------------------------------------------------------------------

def _new_landscape_table_doc(title, headers):
    """Create a document with narrow margins, swapped page size and a header table.

    Returns a ``(document, table)`` pair; the table has one bold header row
    with one column per entry of ``headers``.
    """
    document = docx.Document()
    for section in document.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)
    # Swap width and height of the last section to get a landscape page.
    last_section = document.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height,
        last_section.page_width,
    )
    document.add_heading(title, 0)
    intro = document.add_paragraph()
    intro.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True
    table = document.add_table(rows=1, cols=len(headers))
    table.style = 'TableGrid'
    for cell, header in zip(table.rows[0].cells, headers):
        cell.paragraphs[0].add_run(header).bold = True
    return document, table


# ---------------------------------------------------------------------------
# Text helpers
# ---------------------------------------------------------------------------

def language_detector(text):
    """Return the source language Google detects for ``text``.

    The detection is a by-product of a translation request to Hindi.
    """
    result = translate_client.translate(text, target_language='hi')
    det_lang = result["detectedSourceLanguage"]
    return det_lang


def script_det(text):
    """Return ``script_cat(ch)[0]`` for the first non-punctuation character of ``text``.

    When ``text`` contains no such character, the empty string is passed to
    ``script_cat`` (same as the original behaviour).
    """
    first_char = next((ch for ch in text if ch not in _SCRIPT_DET_PUNCTUATION), "")
    return script_cat(first_char)[0]


def punct_remover(string):
    """Return ``string`` with every punctuation character replaced by a space."""
    return string.translate(_PUNCT_TO_SPACE)


def space_after_punct(text):
    """Insert a space after , ! ? ( ) … - and collapse runs of whitespace."""
    text = text.replace('. . .', ' ... ')
    text = re.sub(r'([,!?()…-])', r'\1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text


def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to the words of ``transliterated``.

    ``transliterated`` is expected to hold one whitespace-separated word for
    every non-punctuation token of ``original`` (after ``space_after_punct``).
    Stand-alone punctuation tokens are copied through; a trailing punctuation
    character on a token is appended to the matching transliterated word.

    Raises:
        IndexError: if ``transliterated`` has fewer words than ``original``
            has non-punctuation tokens.
    """
    original = space_after_punct(original)
    translit_words = transliterated.split()
    rebuilt = []
    next_word = 0
    for token in original.split():
        if token in _TOKEN_PUNCTUATION:
            rebuilt.append(token)
            continue
        word = translit_words[next_word]
        next_word += 1
        if token[-1] in _TOKEN_PUNCTUATION:
            word += token[-1]
        rebuilt.append(word)
    transliterated_sentence = " ".join(rebuilt)
    # str.replace returns a new string; the original code discarded the result,
    # so the spacing added by space_after_punct was never undone.
    transliterated_sentence = transliterated_sentence.replace(' ... ', '...')
    transliterated_sentence = transliterated_sentence.replace('… ', '…')
    return transliterated_sentence


def MNF_translate(text, dest_lang):
    """Translate ``text`` into ``dest_lang`` with the Google v2 client."""
    result = translate_client.translate(text, target_language=dest_lang)
    translated_text = result['translatedText']
    return translated_text


# ---------------------------------------------------------------------------
# Transliteration with source comparison
# ---------------------------------------------------------------------------

def _transliterate_word_by_word(text, lang, sources_name, priority_list, candidates_for):
    """Transliterate ``text`` one word at a time and join the selected outputs.

    ``candidates_for(word)`` must return the candidate outputs in the order
    given by ``sources_name``.  Sentences listed in ``_SKIPPED_SENTENCES`` and
    stand-alone "." words are dropped.
    """
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang=lang):
        if sentence in _SKIPPED_SENTENCES:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            outputs = candidates_for(word)
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))
    return " ".join(transliterated_text)


def dial_comparison_transliteration_rom_dev_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Latin-script text to Devanagari, word by word.

    NOTE: as in the original code, the three language/script arguments are
    ignored and replaced by "hi" / "Latin" / "Devanagari".
    """
    source_lang = "hi"
    source_script = "Latin"
    dest_script = "Devanagari"
    sources_name = {'0': 'Azure', '1': 'indic_trans', '2': 'google', '3': 'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']

    def candidates_for(word):
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            indic_trans(word, source_script, dest_script),
            google(word, 'en', 'hi'),
            indic_transliteration_IAST(word),
        ]

    return _transliterate_word_by_word(text, 'en', sources_name, priority_list, candidates_for)


def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Devanagari text to Latin script, sentence by sentence.

    Each sentence is stripped of punctuation and sent once to every source;
    the outputs are then compared word by word and the punctuation of the
    original sentence is re-attached to the selected words.

    Raises:
        IndexError: if a source returns fewer words than the input sentence has.
    """
    sources_name = {'0': 'indic_trans', '1': 'Azure', '2': 'libindic', '3': 'sheetal', '4': 'ritwik'}
    priority_list = ['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in _SKIPPED_SENTENCES:
            continue
        print("original sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence after punctuation", temp_sentence)
        # One request per source for the whole sentence, in sources_name order.
        per_source_words = [
            indic_trans(temp_sentence, source_script, dest_script).split(),
            azure_transliteration(temp_sentence, source_lang, source_script, dest_script).split(),
            libindic(temp_sentence, dest_script).rstrip().split(),
            sheetal(temp_sentence).replace('\n', '').split(),
            ritwik(temp_sentence).replace('\n', '').rstrip().split(),
        ]
        chosen_words = []
        for index, word in enumerate(temp_sentence.split()):
            # Indexing (not zip) on purpose: a source that returned too few
            # words raises instead of silently dropping the sentence tail.
            outputs = [words[index] for words in per_source_words]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        trans_sent_wo_punct = " ".join(chosen_words)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_with_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)


def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Arabic-script text to Latin script, word by word."""
    print("hello")
    sources_name = {'0': 'indic_trans', '1': 'Azure', '2': 'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']

    def candidates_for(word):
        return [
            indic_trans(word, source_script, dest_script),
            azure_transliteration(word, source_lang, source_script, dest_script),
            buck_2_unicode(word),
        ]

    return _transliterate_word_by_word(text, 'en', sources_name, priority_list, candidates_for)


def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Kannada text to Latin script, word by word."""
    print("hello")
    sources_name = {'0': 'om_transliteration', '1': 'indic_trans', '2': 'libindic', '3': 'Azure'}
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']

    def candidates_for(word):
        return [
            om_transliterator(word),
            indic_trans(word, source_script, dest_script),
            # Fixed: the original passed the whole ``text`` here, so libindic's
            # candidate for every word was the transliteration of the full input.
            libindic(word, dest_script),
            azure_transliteration(word, source_lang, source_script, dest_script),
        ]

    return _transliterate_word_by_word(text, 'en', sources_name, priority_list, candidates_for)


def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Tamil text to Latin script, word by word."""
    print("hello")
    sources_name = {'0': 'Azure', '1': 'libindic', '2': 'indic_trans'}
    priority_list = ['Azure', 'libindic', 'indic_trans']

    def candidates_for(word):
        # Fixed: candidates are now listed in sources_name order (the original
        # put indic_trans at index 1 and libindic at index 2) and libindic
        # receives the word, not the whole text.
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            libindic(word, dest_script),
            indic_trans(word, source_script, dest_script),
        ]

    return _transliterate_word_by_word(text, 'en', sources_name, priority_list, candidates_for)


def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Select the best candidate for ``word``, log the comparison, return its text.

    The two best ``(output, source)`` pairs come from
    ``selection_source_transliteration``; both are added as a row to the
    module-level ``table2`` and the text of the first pair is returned.
    """
    O1ANDS1, O2ANDS2 = selection_source_transliteration(sources_name, outputs, priority_list)
    print(O1ANDS1)
    add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name)
    return O1ANDS1[0]


def add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name):
    """Append a row to ``table2``: the input word and the two selected outputs.

    Each output cell holds the output text followed by "(Source : <name>)".
    ``doc2`` and ``sources_name`` are accepted for interface compatibility but
    not used.
    """
    row_Cells = table2.add_row().cells
    row_Cells[0].text = word
    row_Cells[1].text = O1ANDS1[0]
    row_Cells[1].paragraphs[0].add_run('(Source : ' + str(O1ANDS1[1]) + ')')
    row_Cells[2].text = O2ANDS2[0]
    row_Cells[2].paragraphs[0].add_run('(Source : ' + str(O2ANDS2[1]) + ')')


# Transliteration routine for every supported (source script, destination script) pair.
_TRANSLITERATORS = {
    ("Devanagari", "Latin"): dial_comparison_transliteration_dev_rom_ph1,
    ("Latin", "Devanagari"): dial_comparison_transliteration_rom_dev_ph1,
    ("Arabic", "Latin"): dial_comparison_transliteration_arbic_to_rom_ph1,
    ("Kannada", "Latin"): dial_comparison_transliteration_kann_to_rom_ph1,
    ("Tamil", "Latin"): dial_comparison_transliteration_tamil_to_rom_ph1,
}


# ---------------------------------------------------------------------------
# Scene helpers
# ---------------------------------------------------------------------------

def _collect_dialogues(scene_list):
    """Return every non-empty dialogue text of ``scene_list`` in script order.

    Dialogues that carry a parenthetical are included: the writing loop below
    transliterates them too, so skipping them here (as the original did) made
    ``original_dialogues`` drift out of step with the dialogues being written.
    """
    collected = []
    for scene in tqdm(scene_list):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                # Slug line or action line: not a dialogue.
                continue
            print("In dialogue")
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            if line[speaker][2] == "":
                continue
            collected.append(line[speaker][2])
    return collected


def _detect_source_languages(scene_list):
    """Detect the action-line language and the dialogue language/script.

    The first action line and the first non-empty dialogue without a
    parenthetical are used.  Returns
    ``(non_dial_src_lang, dial_src_lang, dial_src_script)``; an entry is
    ``None`` when the script has no line of the corresponding kind.
    """
    non_dial_lang = None
    dial_lang = None
    dial_script = None
    for scene in tqdm(scene_list):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                if non_dial_lang is None:
                    non_dial_lang = language_detector(line)
            else:
                [speaker] = line.keys()
                if speaker == 'Transition' or line[speaker][0] != 'NONE':
                    continue
                if dial_lang is None and line[speaker][2] != "":
                    dial_lang = language_detector(line[speaker][2])
                    dial_script = script_det(line[speaker][2])
            if non_dial_lang is not None and dial_lang is not None:
                # Both found: stop early to avoid further detection requests.
                return non_dial_lang, dial_lang, dial_script
    return non_dial_lang, dial_lang, dial_script


# ---------------------------------------------------------------------------
# Script
# ---------------------------------------------------------------------------

filename1 = sys.argv[1]  # get translated file from UI-1(translation)
dial_dest_script = sys.argv[2]
dual_dial_script = sys.argv[3]  # Yes/No
translation_and_transliteration = sys.argv[4]  # Yes/No
filename2 = sys.argv[5]  # original file or take input as scenes from final translation

# create an instance of a word document
doc = docx.Document()
doc_file = "translated/" + "trans" + ntpath.basename(filename1)
print(doc_file)

# doc1a and doc1b are built but never saved (their save calls were already
# disabled in the original script); only doc2 / table2 receive rows.
doc1a, table1a = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    ["Input", "Google", "MNF Diff Score Method", "Bleu Diff Score Method"],
)
doc1b, table1b = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    ["Gleu Diff Score Method", "Meteor Diff Score Method", "Rougen Diff Score Method", "Rougel Diff Score Method"],
)
doc2, table2 = _new_landscape_table_doc(
    'Final table ' + doc_file,
    ["Input", "Output1", "Output2", "Output3"],
)

# Parse the file to transliterate ...
refined, total_scenes = getRefined(filename1)
print(refined)
sluglines, without_slug = getSlugAndNonSlug(refined)
print(sluglines)
characters = getSpeakers(without_slug)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)

# ... and the original file.
refined, total_scenes = getRefined(filename2)
sluglines, without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes1, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)

original_dialogues = _collect_dialogues(scenes1[:MAX_SCENES])

non_dial_src_lang, dial_src_lang, dial_src_script = _detect_source_languages(scenes)
print("non_dial_src_lang", non_dial_src_lang)
print("dial_src_lang", dial_src_lang)
print("dial_src_script", dial_src_script)
print("dial_dest_script", dial_dest_script)

# None when the detected/requested script pair is not supported; dialogues are
# then written untransliterated instead of failing on an unbound variable.
transliterate = _TRANSLITERATORS.get((dial_src_script, dial_dest_script))

j = 0  # index of the next entry of original_dialogues to pair with a dialogue
for scene in tqdm(scenes[:MAX_SCENES]):
    for i, line in enumerate(scene):
        if i == 0:
            addSlugLine(doc, line)
            continue
        if isinstance(line, str):
            # Action lines are copied as they are (their translation is disabled).
            addActionLine(doc, line, non_dial_src_lang)
            continue

        print("In dialogue")
        [speaker] = line.keys()
        if speaker == 'Transition':
            addTransition(doc, line[speaker])
            continue
        addSpeaker(doc, speaker)

        parenthetical = line[speaker][0]
        dialogue = line[speaker][2]
        if parenthetical != 'NONE':
            # Parentheticals are copied untranslated.
            addParenthetical(doc, parenthetical)

        print("dialogue to be transliterated ", dialogue)
        if dialogue == "":
            continue

        if transliterate is None:
            trans_text = dialogue
        else:
            trans_text = transliterate(dialogue, dial_src_lang, dial_src_script, dial_dest_script)

        if dual_dial_script == "Yes":
            if translation_and_transliteration == "Yes":
                # Pair with the dialogue of the original file; fall back to the
                # current dialogue if the original file has fewer dialogues.
                source_text = original_dialogues[j] if j < len(original_dialogues) else dialogue
                dual_script(doc, source_text, trans_text, dial_src_lang)
                j += 1
            else:
                dual_script(doc, dialogue, trans_text, dial_src_lang)
        else:
            addDialogue(doc, trans_text, dial_src_lang)

# Make sure the output directory exists so the run does not fail at the very end.
os.makedirs(os.path.dirname(doc_file), exist_ok=True)
doc.save(doc_file)
doc2.save("final.docx")