# --- Standard-library / third-party imports --------------------------------
import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
import statistics
from statistics import mode
from indicnlp.tokenize import sentence_tokenize
import nltk

# Check that the NLTK resources used below are installed.  Downloads are
# disabled (commented out), so a missing "punkt" is silently ignored and a
# missing "wordnet" only prints.
try:
    print("time1111")  # debug marker left by the author
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # nltk.download("punkt")
    pass
try:
    nltk.data.find("wordnet")
except LookupError:
    # nltk.download('wordnet')
    print("pass")
from nltk.tokenize import sent_tokenize

# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')

# --- Google Cloud Translation setup ----------------------------------------
# Both the v2 client (used for language detection) and the v3
# TranslationServiceClient are created at import time; credentials come from
# a service-account JSON file in the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate

translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = "authentic-bongo-272808"
location = "global"
parent = f"projects/{project_id}/locations/{location}"

# --- Project-local helpers -------------------------------------------------
from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import (
    manual_diff_score,
    bleu_diff_score,
    gleu_diff_score,
    meteor_diff_score,
    rouge_diff_score,
    diff_score,
    critera4_5,
)
from selection_source import (
    selection_source,
    function5,
    function41,
    function311,
    function221,
    function2111,
    function11111,
    selection_source_transliteration,
    two_sources_two_outputs,
)
from script_writing import (
    addSlugLine,
    addActionLine,
    addSpeaker,
    addParenthetical,
    addDialogue,
    dual_script,
    addTransition,
    dial_checker,
    non_dial_checker,
)
from script_reading import (
    breaksen,
    getRefined,
    getSlugAndNonSlug,
    getSpeakers,
    getScenes,
)
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import (
    azure_transliteration,
    indic_trans,
    om_transliterator,
    libindic,
    indic_transliteration_IAST,
    indic_transliteration_ITRANS,
    sheetal,
    ritwik,
)

# --- Command-line arguments ------------------------------------------------
filename1 = sys.argv[1]  # original file: path to the input screenplay
non_dial_src_lang = sys.argv[2]  # source language of non-dialogue (action) text
non_dial_dest_lang = sys.argv[3]  # target language of non-dialogue text
dial_src_lang = sys.argv[4]  # source language of dialogue text
dial_dest_lang = sys.argv[5]  # target language of dialogue text
dual_dial_script = sys.argv[6]  # yes/No: keep original dialogue next to translation

# Translation is only performed when source and target languages differ.
if non_dial_src_lang != non_dial_dest_lang:
    global_non_dialogue_flag = "Yes"
else:
    global_non_dialogue_flag = "No"
if dial_src_lang != dial_dest_lang:
    global_dialogue_flag = "Yes"
else:
    global_dialogue_flag = "No"

# Earlier argv layout, kept commented for reference:
# filename1 = sys.argv[1]
# dial_dest_lang = sys.argv[2]
# dial_dest_lang = user_script_data.get("dial_dest_language")
# dial_dest_script = sys.argv[3]
# dial_dest_script = user_script_data.get("dial_dest_script")
# non_dial_dest_lang = sys.argv[4]
# non_dial_dest_lang = user_script_data.get("nondial_dest_language")
# dual_dial_script = sys.argv[5]
# dual_dial_script = user_script_data.get("dual_dial_script")  # Yes,No

# Language codes for which machine translation is attempted.
# NOTE(review): this list contains duplicates ("gu" twice, and the run
# "he","hy","km","ka","mn","ku","ky","tk" appears twice).  Harmless for the
# `in translation_list` membership tests below, but worth deduplicating.
translation_list = [
    "en", "ta", "hi", "ar", "ur", "kn", "gu", "bg", "bn", "te", "ml", "ru",
    "sr", "uk", "hr", "ga", "sq", "mr", "fa", "tr", "hu", "it", "ro", "pa",
    "gu", "or", "zh", "ne", "fr", "es", "id", "el", "ja", "ko", "be", "uz",
    "sd", "af", "de", "is", "ig", "la", "pt", "my", "th", "su", "lo", "am",
    "si", "az", "kk", "mk", "bs", "ps", "mg", "ms", "yo", "cs", "da", "nl",
    "tl", "no", "sl", "sv", "vi", "cy", "he", "hy", "km", "ka", "mn", "ku",
    "ky", "tk", "he", "hy", "km", "ka", "mn", "ku", "ky", "tk", "fi", "ht",
    "haw", "lt", "lb", "mt", "pl", "eo", "tt", "ug", "ha", "so", "sw", "yi",
    "eu", "ca", "ceb", "co", "et", "fy", "gl", "hmn", "rw", "lv", "mi", "sm",
    "gd", "st", "sn", "sk", "xh", "zu",
]

# create an instance of a word document for the translated screenplay;
# the output name embeds the dialogue target language and the input basename.
doc = docx.Document()
doc_file = (
    "translated/"
    + str(dial_dest_lang)
    + "_"
    + "trans"
    + "_of_"
    + ntpath.basename(filename1)
)
print(doc_file)

# --- doc1a: comparison table (Input / Google / MNF / Bleu) -----------------
# Narrow margins, then swap page width/height to get landscape orientation.
doc1a = docx.Document()
sections = doc1a.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc1a.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Dialogue Comparision Table of " + doc_file
doc1a.add_heading(name, 0)
doc_para = doc1a.add_paragraph()
doc_para.add_run(
    "Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table1a = doc1a.add_table(rows=1, cols=4)
table1a.style = "TableGrid"
hdr_Cells = table1a.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Input").bold = True
hdr_Cells[1].paragraphs[0].add_run("Google").bold = True
hdr_Cells[2].paragraphs[0].add_run("MNF Diff Score Method").bold = True
hdr_Cells[3].paragraphs[0].add_run("Bleu Diff Score Method").bold = True

# --- doc1b: comparison table (Gleu / Meteor / Rougen / Rougel) -------------
doc1b = docx.Document()
sections = doc1b.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc1b.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Dialogue Comparision Table of " + doc_file
doc1b.add_heading(name, 0)
doc_para = doc1b.add_paragraph()
doc_para.add_run(
    "Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table1b = doc1b.add_table(rows=1, cols=4)
table1b.style = "TableGrid"
hdr_Cells = table1b.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Gleu Diff Score Method").bold = True
hdr_Cells[1].paragraphs[0].add_run("Meteor Diff Score Method").bold = True
hdr_Cells[2].paragraphs[0].add_run("Rougen Diff Score Method").bold = True
hdr_Cells[3].paragraphs[0].add_run("Rougel Diff Score Method").bold = True

# --- doc2: final table with the best three candidate translations ----------
doc2 = docx.Document()
sections = doc2.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc2.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Final table " + doc_file
doc2.add_heading(name, 0)
doc_para = doc2.add_paragraph()
doc_para.add_run(
    "Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table2 = doc2.add_table(rows=1, cols=4)
table2.style = "TableGrid"
hdr_Cells = table2.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Input").bold = True
hdr_Cells[1].paragraphs[0].add_run("Output1").bold = True
hdr_Cells[2].paragraphs[0].add_run("Output2").bold = True
hdr_Cells[3].paragraphs[0].add_run("Output3").bold = True

# process the input script and return scenes
refined, total_scenes = getRefined(filename1)
print(refined)
# log.debug(refined)
sluglines, without_slug = getSlugAndNonSlug(refined)
print(sluglines)
# log.debug(sluglines)
characters = getSpeakers(without_slug)
# print(characters)
# log.debug(characters)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(
    refined, total_scenes, characters
)
# print(scenes)


# to detect the language
def language_detector(text):
    """Return the language code Google (v2 API) detects for `text`.

    Translates toward "hi" only to obtain `detectedSourceLanguage` from the
    response; the translated text itself is discarded.
    """
    result = translate_client.translate(text, target_language="hi")
    det_lang = result["detectedSourceLanguage"]
    return det_lang


class myDict(dict):
    """A dict with an explicit `add(key, value)` convenience method."""

    def __init__(self):
        # NOTE(review): rebinding `self` here has no effect on the instance;
        # this __init__ is effectively a no-op beyond dict's own init.
        self = dict()

    def add(self, key, value):
        self[key] = value


def all_translator(sentence, source_lang, target_lang):
    """Translate `sentence` with every available provider and pick the best.

    Each provider is tried in turn (Google, IBM Watson, AWS, Azure, Lingvanex,
    Yandex); failures are silently skipped, so `trans`/`sources_name` only
    contain the providers that succeeded, keyed by insertion order "0", "1", …
    The winning translation is chosen by compare_outputs().

    NOTE(review): the bare `except:` clauses swallow every error (including
    KeyboardInterrupt); and if *all* providers fail, `trans["0"]` below raises
    KeyError.  The `globals()["t%s" % i]` pattern leaks t0..t5 into module
    globals — presumably a debugging aid; verify before removing.
    """
    print("in all translator", sentence)
    # if sentence=="" or sentence==" ":
    #     return
    i = 0
    trans = myDict()
    sources_name = myDict()
    try:
        globals()["t%s" % i] = google(sentence, source_lang, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "GOOGLE")
        i = i + 1
    except:
        pass
    try:
        globals()["t%s" % i] = ibm_watson(sentence, source_lang, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "IBM_WATSON")
        i = i + 1
    except:
        pass
    try:
        globals()["t%s" % i] = aws(sentence, source_lang, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "AWS")
        i = i + 1
    except:
        pass
    try:
        # azure() takes no source language — only the sentence and target.
        globals()["t%s" % i] = azure(sentence, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "AZURE")
        i = i + 1
    except:
        pass
    try:
        globals()["t%s" % i] = lingvanex(sentence, source_lang, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "LINGVANEX")
        i = i + 1
    except:
        pass
    try:
        globals()["t%s" % i] = yandex(sentence, source_lang, target_lang)
        trans.add(str(i), globals()["t%s" % i])
        sources_name.add(str(i), "YANDEX")
        i = i + 1
    except:
        pass
    # print(trans)
    # print(sources_name)
    # trans["0"] is the first provider that succeeded (treated as the
    # Google/baseline output by compare_outputs).
    trans_text = compare_outputs(sentence, trans["0"], trans, sources_name)
    return trans_text


# take paragraph which returns translated paragraph by comparing translated
# outputs from different resources
def translation_with_spcecial_dots(Sentence, source_lang, target_lang):
    """Translate a sentence that contains ellipsis-like separators.

    Splits on each of "...", "…", ". . ." in turn, translates every fragment
    via all_translator() (recursing when a fragment still contains another
    separator), re-appends the separator after non-final fragments, and joins
    the translated fragments with spaces.

    NOTE(review): the inner loop rebinds the `Sentence` parameter, and the
    "is this the last fragment" test compares by value (`Sentence !=
    Sentences[-1]`), so a non-final fragment equal in text to the last one
    loses its separator — confirm intended.
    """
    special_characters = ["...", "…", ". . ."]
    translated_text = []
    for i in special_characters:
        if i not in Sentence:
            continue
        Sentences = Sentence.split(i)
        for Sentence in Sentences:
            if Sentence == " " or Sentence == "":
                continue
            if any(ext in Sentence for ext in special_characters):
                trans_text = translation_with_spcecial_dots(
                    Sentence, source_lang, target_lang
                )
            else:
                if Sentence != Sentences[-1]:
                    trans_text = all_translator(Sentence, source_lang, target_lang) + i
                else:
                    trans_text = all_translator(Sentence, source_lang, target_lang)
            translated_text.append(trans_text)
    return " ".join(translated_text)


def translate_comparison(text, source_lang, target_lang):
    """Translate a paragraph sentence-by-sentence.

    Tokenizes `text` with NLTK's sent_tokenize, skips empty sentences, routes
    sentences containing ellipsis separators through
    translation_with_spcecial_dots(), and everything else through
    all_translator().  Returns the translated sentences joined by spaces.
    """
    sentences = sent_tokenize(text)
    special_characters = ["...", "…", ". . ."]
    translated_text = []
    for sentence in sentences:
        if sentence == " " or sentence == "":
            continue
        if any(ext in sentence for ext in special_characters):
            trans_text = translation_with_spcecial_dots(
                sentence, source_lang, target_lang
            )
            translated_text.append(trans_text)
        else:
            trans_text = all_translator(sentence, source_lang, target_lang)
            translated_text.append(trans_text)
    return " ".join(translated_text)


# take a sentence and give translated sentence by comparing outputs from
# different resources
def compare_outputs(sentence, t0, trans, sources_name):
    """Pick the best translation of `sentence` from the candidates in `trans`.

    `t0` is the baseline (Google) output.  Five scoring methods (MNF manual
    diff, Gleu, Meteor, Rouge-N, Rouge-L) each nominate an output/source; if
    all five agree with the baseline it is returned directly.  Otherwise the
    disagreeing outputs are collected and selection_source() ranks the top
    three (source, methods) pairs; the top pick's translation is written to
    doc2 and returned.

    NOTE(review): `k` is first a list of outputs, then reused as the dict key
    of the winning source — intentional but easy to misread.
    """
    k = []
    s = []
    methods_name = {
        "0": "MNF",
        "1": "Gleu",
        "2": "Meteor",
        "3": "Rougen",
        "4": "Rougel",
    }
    google_output = t0
    # print("google", google_output)
    output1, source1 = manual_diff_score(trans, sources_name)
    # print("MNF", output1)
    output2, source2 = gleu_diff_score(trans, sources_name)
    # print("gleu", output2)
    output3, source3 = meteor_diff_score(trans, sources_name)
    # print("meteor", output3)
    output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)
    # print("rougen", output4)
    # print("rougel", output5)
    if google_output == output1 == output2 == output3 == output4 == output5:
        # print("all output is same as google")
        return google_output
    else:
        # For each method, record its pick only where it disagrees with the
        # baseline; " " placeholders keep positions aligned per method.
        if google_output != output1:
            k.append(output1)
            s.append(source1)
        else:
            k.append(" ")
            s.append(" ")
        if google_output != output2:
            k.append(output2)
            s.append(source2)
        else:
            k.append(" ")
            s.append(" ")
        if google_output != output3:
            k.append(output3)
            s.append(source3)
        else:
            k.append(" ")
            s.append(" ")
        if google_output != output4:
            k.append(output4)
            s.append(source4)
        else:
            k.append(" ")
            s.append(" ")
        if google_output != output5:
            k.append(output5)
            s.append(source5)
        else:
            k.append(" ")
            s.append(" ")
        k.insert(0, sentence)
        k.insert(1, google_output)
        s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
            s, sources_name, trans, methods_name
        )
        # print("s1", s1ANDm1)
        # print("s2", s2ANDm2)
        # print("s3", s3ANDm3)
        # print(s1ANDm1[0])
        # print(sources_name)
        # add_dial_comparison_doc1a(doc1a, table1a , k, s, s1ANDm1[0])
        # add_dial_comparison_doc1b(doc1b, table1b , k, s, s1ANDm1[0])
        add_dial_comparison_doc2(
            doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
        )
        # Map the winning source name back to its key in `trans`.
        for a, b in sources_name.items():
            if b == s1ANDm1[0]:
                k = a
        output1 = trans[str(k)]
        return output1


def add_dial_comparison_doc1a(doc1a, table1a, k, s, selected_source):
    """Append a row to table1a: input, Google, MNF and Bleu picks.

    `k` holds [sentence, google, method outputs…]; `s` holds the matching
    source names (" " where a method agreed with Google).  The source of the
    finally-selected output is bolded.  Currently unused (call is commented
    out in compare_outputs).
    """
    row_Cells = table1a.add_row().cells
    for i in range(4):
        row_Cells[i].text = k[i]
    for i in range(2):
        if s[i] != " ":
            if s[i] == selected_source:
                row_Cells[i + 2].paragraphs[0].add_run(
                    "(Source : " + s[i] + ")"
                ).bold = True
            else:
                row_Cells[i + 2].paragraphs[0].add_run("(Source : " + s[i] + ")")


def add_dial_comparison_doc1b(doc1b, table1b, k, s, selected_source):
    """Append a row to table1b: Gleu/Meteor/Rougen/Rougel picks (k[4:]).

    Companion of add_dial_comparison_doc1a for the remaining four methods;
    also currently unused.
    """
    row_Cells = table1b.add_row().cells
    n = len(k)
    for i in range(4, n):
        row_Cells[i - 4].text = k[i]
    for i in range(4):
        if s[i + 2] != " ":
            if s[i + 2] == selected_source:
                row_Cells[i].paragraphs[0].add_run(
                    "(Source : " + s[i + 2] + ")"
                ).bold = True
            else:
                row_Cells[i].paragraphs[0].add_run("(Source : " + s[i + 2] + ")")


# to return the table with best 3 outputs
def add_dial_comparison_doc2(
    doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
):
    """Append a row to table2 with the input and up to three ranked outputs.

    Each sNANDmN is a (source_name, methods) pair; the source name is looked
    up in `sources_name` to find its key into `trans`.  Empty source names
    leave the corresponding cell blank.

    NOTE(review): if s1ANDm1[0] matches no entry in sources_name, `k` is
    never bound and `trans[str(k)]` raises NameError — confirm upstream
    guarantees a match.
    """
    row_Cells = table2.add_row().cells
    for a, b in sources_name.items():
        # print(sources_name.items())
        # print(b)
        # print(s1ANDm1[0])
        if b == s1ANDm1[0]:
            k = a
    output1 = trans[str(k)]
    row_Cells[0].text = sentence
    row_Cells[1].text = output1
    row_Cells[1].paragraphs[0].add_run("(Source : " + str(s1ANDm1[0]) + ")")
    row_Cells[1].paragraphs[0].add_run("(Methods : " + str(s1ANDm1[1]) + ")")
    if s2ANDm2[0] == "":
        row_Cells[2].text = ""
    else:
        for a, b in sources_name.items():
            if b == s2ANDm2[0]:
                k = a
        output2 = trans[str(k)]
        row_Cells[2].text = output2
        row_Cells[2].paragraphs[0].add_run("(Source : " + str(s2ANDm2[0]) + ")")
        row_Cells[2].paragraphs[0].add_run("(Methods : " + str(s2ANDm2[1]) + ")")
    if s3ANDm3[0] == "":
        row_Cells[3].text = ""
    else:
        for a, b in sources_name.items():
            if b == s3ANDm3[0]:
                k = a
        output3 = trans[str(k)]
        row_Cells[3].text = output3
        row_Cells[3].paragraphs[0].add_run("(Source : " + str(s3ANDm3[0]) + ")")
        row_Cells[3].paragraphs[0].add_run("(Methods : " + str(s3ANDm3[1]) + ")")


# --- Main loop: translate each scene and write the output screenplay -------
# A scene is a list whose first element is the slug line; string elements are
# action lines, dict elements are {speaker: (parenthetical, ?, dialogue)}.
# NOTE(review): only the first five scenes are processed (`scenes[:5]`) —
# looks like a debugging limit left in; confirm before shipping.
for scene in tqdm(scenes[:5]):
    for i, line in enumerate(scene):
        if i == 0:
            addSlugLine(doc, line)
            continue
        if type(line) == type(""):
            # Plain string => action (non-dialogue) line.
            if global_non_dialogue_flag == "Yes":
                # non_dial_src_lang = language_detector(line)
                non_dial_translate = non_dial_checker(
                    non_dial_dest_lang, non_dial_src_lang
                )
                # print("non_dial_translate", non_dial_translate)
                # if non_dial_translate:
                #     if non_dial_src_lang in translation_list and non_dial_dest_lang in translation_list:
                #         trans_text = translate_comparison(line , non_dial_src_lang, non_dial_dest_lang)
                #         addActionLine(doc, trans_text, non_dial_dest_lang)
                #     else:
                #         addActionLine(doc, line, non_dial_dest_lang)
                if (
                    non_dial_src_lang in translation_list
                    and non_dial_dest_lang in translation_list
                ):
                    trans_text = translate_comparison(
                        line, non_dial_src_lang, non_dial_dest_lang
                    )
                    addActionLine(doc, trans_text, non_dial_dest_lang)
                else:
                    # Unsupported language pair: copy the line through.
                    addActionLine(doc, line, non_dial_dest_lang)
            else:
                addActionLine(doc, line, non_dial_dest_lang)
        else:
            # print(line)
            # Dict => dialogue block keyed by its single speaker name.
            [speaker] = line.keys()
            # print([speaker])
            if speaker == "Transition":
                addTransition(doc, line[speaker])
                continue
            addSpeaker(doc, speaker)
            if global_dialogue_flag == "Yes":
                print("In dialogue")
                # Detect the actual source language from the dialogue text
                # (line[speaker][2] is the dialogue; [0] the parenthetical).
                dial_src_lang = language_detector(line[speaker][2])
                print("dial_src_lang", dial_src_lang)
                # print("p", line[speaker][0])
                if line[speaker][0] != "NONE":
                    # Parentheticals are translated with Google only.
                    out = google(line[speaker][0], dial_src_lang, dial_dest_lang)
                    addParenthetical(doc, out)
                # else:
                #     addParenthetical(doc,line[speaker][0])
                dial_translate = dial_checker(dial_dest_lang, dial_src_lang)
                print("dial_translate", dial_translate)
                if dial_translate:
                    print("dialogue to be translated", line[speaker][2])
                    if line[speaker][2] == "":
                        continue
                    if (
                        dial_src_lang in translation_list
                        and dial_dest_lang in translation_list
                    ):
                        trans_text = translate_comparison(
                            line[speaker][2], dial_src_lang, dial_dest_lang
                        )
                        if dual_dial_script == "Yes":
                            # Keep the original dialogue alongside the
                            # translation.
                            dual_script(
                                doc, line[speaker][2], trans_text, dial_dest_lang
                            )
                        else:
                            addDialogue(doc, trans_text, dial_dest_lang)
                    else:
                        addDialogue(doc, line[speaker][2], dial_dest_lang)
            else:
                # No translation requested: emit the block unchanged.
                addParenthetical(doc, line[speaker][0])
                addDialogue(doc, line[speaker][2], dial_dest_lang)

doc.save(doc_file)
# doc1a.save("1"+doc_file)
# doc1b.save("2"+doc_file)
doc2.save(
    str(filename1.split(".")[0])
    + "_trans_to_"
    + str(dial_dest_lang)
    + "_"
    + "final.docx"
)