Conversion_Kitchen_Code/kitchen_counter/conversion/attachments/final_translation.py

import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
import statistics
from statistics import mode
from indicnlp.tokenize import sentence_tokenize
import nltk
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # nltk.download("punkt")
    pass
try:
    nltk.data.find("wordnet")
except LookupError:
    # nltk.download("wordnet")
    pass
from nltk.tokenize import sent_tokenize
# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')
# Google Cloud Translation setup
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = "authentic-bongo-272808"
location = "global"
parent = f"projects/{project_id}/locations/{location}"
from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import (
    manual_diff_score,
    bleu_diff_score,
    gleu_diff_score,
    meteor_diff_score,
    rouge_diff_score,
    diff_score,
    critera4_5,
)
from selection_source import (
    selection_source,
    function5,
    function41,
    function311,
    function221,
    function2111,
    function11111,
    selection_source_transliteration,
    two_sources_two_outputs,
)
from script_writing import (
    addSlugLine,
    addActionLine,
    addSpeaker,
    addParenthetical,
    addDialogue,
    dual_script,
    addTransition,
    dial_checker,
    non_dial_checker,
)
from script_reading import (
    breaksen,
    getRefined,
    getSlugAndNonSlug,
    getSpeakers,
    getScenes,
)
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import (
    azure_transliteration,
    indic_trans,
    om_transliterator,
    libindic,
    indic_transliteration_IAST,
    indic_transliteration_ITRANS,
    sheetal,
    ritwik,
)
filename1 = sys.argv[1] # original file
non_dial_src_lang = sys.argv[2]
non_dial_dest_lang = sys.argv[3]
dial_src_lang = sys.argv[4]
dial_dest_lang = sys.argv[5]
dual_dial_script = sys.argv[6] # yes/No
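# Example invocation (file name and language codes are illustrative):
#   python final_translation.py screenplay.docx en hi en hi No
# i.e. <input file> <non-dial src> <non-dial dest> <dial src> <dial dest> <dual dial script Yes/No>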
if non_dial_src_lang != non_dial_dest_lang:
    global_non_dialogue_flag = "Yes"
else:
    global_non_dialogue_flag = "No"
if dial_src_lang != dial_dest_lang:
    global_dialogue_flag = "Yes"
else:
    global_dialogue_flag = "No"
# filename1 = sys.argv[1]
# dial_dest_lang = sys.argv[2]
# #dial_dest_lang = user_script_data.get("dial_dest_language")
# dial_dest_script = sys.argv[3]
# #dial_dest_script = user_script_data.get("dial_dest_script")
# non_dial_dest_lang = sys.argv[4]
# #non_dial_dest_lang = user_script_data.get("nondial_dest_language")
# dual_dial_script = sys.argv[5]
# #dual_dial_script = user_script_data.get("dual_dial_script") # Yes,No
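# Language codes (mostly ISO 639-1, plus a few longer Google-style codes) for which
# multi-source translation comparison is attempted; other languages fall back to the
# original, untranslated text in the loops below.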
translation_list = [
    "en", "ta", "hi", "ar", "ur", "kn", "gu", "bg", "bn", "te", "ml", "ru",
    "sr", "uk", "hr", "ga", "sq", "mr", "fa", "tr", "hu", "it", "ro", "pa",
    "or", "zh", "ne", "fr", "es", "id", "el", "ja", "ko", "be", "uz", "sd",
    "af", "de", "is", "ig", "la", "pt", "my", "th", "su", "lo", "am", "si",
    "az", "kk", "mk", "bs", "ps", "mg", "ms", "yo", "cs", "da", "nl", "tl",
    "no", "sl", "sv", "vi", "cy", "he", "hy", "km", "ka", "mn", "ku", "ky",
    "tk", "fi", "ht", "haw", "lt", "lb", "mt", "pl", "eo", "tt", "ug", "ha",
    "so", "sw", "yi", "eu", "ca", "ceb", "co", "et", "fy", "gl", "hmn", "rw",
    "lv", "mi", "sm", "gd", "st", "sn", "sk", "xh", "zu",
]
# create an instance of a word document
doc = docx.Document()
doc_file = (
"translated/"
+ str(dial_dest_lang)
+ "_"
+ "trans"
+ "_of_"
+ ntpath.basename(filename1)
)
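# e.g. doc_file == "translated/hi_trans_of_screenplay.docx" for dial_dest_lang "hi"
# and input file "screenplay.docx" (illustrative values).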
print(doc_file)
doc1a = docx.Document()
sections = doc1a.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc1a.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Dialogue Comparision Table of " + doc_file
doc1a.add_heading(name, 0)
doc_para = doc1a.add_paragraph()
doc_para.add_run(
"Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table1a = doc1a.add_table(rows=1, cols=4)
table1a.style = "TableGrid"
hdr_Cells = table1a.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Input").bold = True
hdr_Cells[1].paragraphs[0].add_run("Google").bold = True
hdr_Cells[2].paragraphs[0].add_run("MNF Diff Score Method").bold = True
hdr_Cells[3].paragraphs[0].add_run("Bleu Diff Score Method").bold = True
doc1b = docx.Document()
sections = doc1b.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc1b.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Dialogue Comparision Table of " + doc_file
doc1b.add_heading(name, 0)
doc_para = doc1b.add_paragraph()
doc_para.add_run(
"Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table1b = doc1b.add_table(rows=1, cols=4)
table1b.style = "TableGrid"
hdr_Cells = table1b.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Gleu Diff Score Method").bold = True
hdr_Cells[1].paragraphs[0].add_run("Meteor Diff Score Method").bold = True
hdr_Cells[2].paragraphs[0].add_run("Rougen Diff Score Method").bold = True
hdr_Cells[3].paragraphs[0].add_run("Rougel Diff Score Method").bold = True
doc2 = docx.Document()
sections = doc2.sections
for section in sections:
    section.top_margin = Inches(0.2)
    section.bottom_margin = Inches(0.2)
    section.left_margin = Inches(0.2)
    section.right_margin = Inches(0.2)
section = doc2.sections[-1]
new_height = section.page_width
section.page_width = section.page_height
section.page_height = new_height
name = "Final table " + doc_file
doc2.add_heading(name, 0)
doc_para = doc2.add_paragraph()
doc_para.add_run(
    "Translation resources used: Google, IBM Watson, AWS, Azure, Lingvanex, Yandex"
).bold = True
table2 = doc2.add_table(rows=1, cols=4)
table2.style = "TableGrid"
hdr_Cells = table2.rows[0].cells
hdr_Cells[0].paragraphs[0].add_run("Input").bold = True
hdr_Cells[1].paragraphs[0].add_run("Output1").bold = True
hdr_Cells[2].paragraphs[0].add_run("Output2").bold = True
hdr_Cells[3].paragraphs[0].add_run("Output3").bold = True
# process the input script and return scenes
refined, total_scenes = getRefined(filename1)
print(refined)
# log.debug(refined)
sluglines, without_slug = getSlugAndNonSlug(refined)
print(sluglines)
# log.debug(sluglines)
characters = getSpeakers(without_slug)
# print(characters)
# log.debug(characters)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(
    refined, total_scenes, characters
)
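# Shape of the parsed data consumed below (inferred from how getScenes output is
# used): each scene is a list whose first element is the slug line; plain strings
# are action lines, and dicts map a speaker (or "Transition") to a list whose
# index 0 holds the parenthetical ("NONE" if absent) and index 2 the dialogue text.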
# print(scenes)
# to detect the language
def language_detector(text):
    result = translate_client.translate(text, target_language="hi")
    det_lang = result["detectedSourceLanguage"]
    return det_lang
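# Usage sketch (hedged; requires the Google credentials configured above):
#   language_detector("नमस्ते, आप कैसे हैं?")  # expected to return "hi"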
class myDict(dict):
    """dict subclass with a small add() convenience method."""
    def add(self, key, value):
        self[key] = value
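# Usage sketch:
#   d = myDict()
#   d.add("0", "translated text")   # equivalent to d["0"] = "translated text"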
def all_translator(sentence, source_lang, target_lang):
    print("in all translator", sentence)
    # if sentence == "" or sentence == " ":
    #     return
    i = 0
    trans = myDict()
    sources_name = myDict()
    # Query each translation resource in turn; a resource that raises is skipped.
    try:
        trans.add(str(i), google(sentence, source_lang, target_lang))
        sources_name.add(str(i), "GOOGLE")
        i = i + 1
    except Exception:
        pass
    try:
        trans.add(str(i), ibm_watson(sentence, source_lang, target_lang))
        sources_name.add(str(i), "IBM_WATSON")
        i = i + 1
    except Exception:
        pass
    try:
        trans.add(str(i), aws(sentence, source_lang, target_lang))
        sources_name.add(str(i), "AWS")
        i = i + 1
    except Exception:
        pass
    try:
        trans.add(str(i), azure(sentence, target_lang))
        sources_name.add(str(i), "AZURE")
        i = i + 1
    except Exception:
        pass
    try:
        trans.add(str(i), lingvanex(sentence, source_lang, target_lang))
        sources_name.add(str(i), "LINGVANEX")
        i = i + 1
    except Exception:
        pass
    try:
        trans.add(str(i), yandex(sentence, source_lang, target_lang))
        sources_name.add(str(i), "YANDEX")
        i = i + 1
    except Exception:
        pass
    # print(trans)
    # print(sources_name)
    trans_text = compare_outputs(sentence, trans["0"], trans, sources_name)
    return trans_text
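# Illustrative shape of the dicts built above (values hypothetical):
#   trans        -> {"0": "<google output>", "1": "<ibm output>", ...}
#   sources_name -> {"0": "GOOGLE", "1": "IBM_WATSON", ...}
# compare_outputs() then selects one of these candidate strings to return.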
# Take a paragraph and return a translated paragraph built by comparing translated outputs from the different resources.
def translation_with_spcecial_dots(Sentence, source_lang, target_lang):
    # NOTE: the "…" entry is assumed to be the intended Unicode ellipsis; an empty
    # separator here would make str.split() raise a ValueError.
    special_characters = ["...", "…", ". . ."]
    translated_text = []
    for i in special_characters:
        if i not in Sentence:
            continue
        Sentences = Sentence.split(i)
        for Sentence in Sentences:
            if Sentence == " " or Sentence == "":
                continue
            if any(ext in Sentence for ext in special_characters):
                trans_text = translation_with_spcecial_dots(
                    Sentence, source_lang, target_lang
                )
            else:
                if Sentence != Sentences[-1]:
                    trans_text = all_translator(Sentence, source_lang, target_lang) + i
                else:
                    trans_text = all_translator(Sentence, source_lang, target_lang)
            translated_text.append(trans_text)
    return " ".join(translated_text)
def translate_comparison(text, source_lang, target_lang):
    sentences = sent_tokenize(text)
    special_characters = ["...", "…", ". . ."]  # "…" assumed, as above
    translated_text = []
    for sentence in sentences:
        if sentence == " " or sentence == "":
            continue
        if any(ext in sentence for ext in special_characters):
            trans_text = translation_with_spcecial_dots(
                sentence, source_lang, target_lang
            )
            translated_text.append(trans_text)
        else:
            trans_text = all_translator(sentence, source_lang, target_lang)
            translated_text.append(trans_text)
    return " ".join(translated_text)
# Take a sentence and return the best translated sentence by comparing outputs from the different resources.
def compare_outputs(sentence, t0, trans, sources_name):
    k = []
    s = []
    methods_name = {
        "0": "MNF",
        "1": "Gleu",
        "2": "Meteor",
        "3": "Rougen",
        "4": "Rougel",
    }
    google_output = t0
    # print("google", google_output)
    output1, source1 = manual_diff_score(trans, sources_name)
    # print("MNF", output1)
    output2, source2 = gleu_diff_score(trans, sources_name)
    # print("gleu", output2)
    output3, source3 = meteor_diff_score(trans, sources_name)
    # print("meteor", output3)
    output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)
    # print("rougen", output4)
    # print("rougel", output5)
    if google_output == output1 == output2 == output3 == output4 == output5:
        # print("all output is same as google")
        return google_output
    else:
        # Keep only the outputs that differ from the Google baseline; blank
        # placeholders keep the positions aligned for the comparison tables.
        for output, source in (
            (output1, source1),
            (output2, source2),
            (output3, source3),
            (output4, source4),
            (output5, source5),
        ):
            if google_output != output:
                k.append(output)
                s.append(source)
            else:
                k.append(" ")
                s.append(" ")
        k.insert(0, sentence)
        k.insert(1, google_output)
        s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
            s, sources_name, trans, methods_name
        )
        # print("s1", s1ANDm1)
        # print("s2", s2ANDm2)
        # print("s3", s3ANDm3)
        # print(s1ANDm1[0])
        # print(sources_name)
        # add_dial_comparison_doc1a(doc1a, table1a, k, s, s1ANDm1[0])
        # add_dial_comparison_doc1b(doc1b, table1b, k, s, s1ANDm1[0])
        add_dial_comparison_doc2(
            doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
        )
        for a, b in sources_name.items():
            if b == s1ANDm1[0]:
                k = a
        output1 = trans[str(k)]
        return output1
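# compare_outputs() therefore returns either the Google output (when every scoring
# method agrees with it) or the candidate from the source picked by selection_source().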
def add_dial_comparison_doc1a(doc1a, table1a, k, s, selected_source):
    row_Cells = table1a.add_row().cells
    for i in range(4):
        row_Cells[i].text = k[i]
    for i in range(2):
        if s[i] != " ":
            if s[i] == selected_source:
                row_Cells[i + 2].paragraphs[0].add_run(
                    "(Source : " + s[i] + ")"
                ).bold = True
            else:
                row_Cells[i + 2].paragraphs[0].add_run("(Source : " + s[i] + ")")
def add_dial_comparison_doc1b(doc1b, table1b, k, s, selected_source):
    row_Cells = table1b.add_row().cells
    n = len(k)
    for i in range(4, n):
        row_Cells[i - 4].text = k[i]
    for i in range(4):
        if s[i + 2] != " ":
            if s[i + 2] == selected_source:
                row_Cells[i].paragraphs[0].add_run(
                    "(Source : " + s[i + 2] + ")"
                ).bold = True
            else:
                row_Cells[i].paragraphs[0].add_run("(Source : " + s[i + 2] + ")")
# Fill a row of the final table with the best 3 outputs and their sources/methods.
def add_dial_comparison_doc2(
    doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
):
    row_Cells = table2.add_row().cells
    for a, b in sources_name.items():
        # print(sources_name.items())
        # print(b)
        # print(s1ANDm1[0])
        if b == s1ANDm1[0]:
            k = a
    output1 = trans[str(k)]
    row_Cells[0].text = sentence
    row_Cells[1].text = output1
    row_Cells[1].paragraphs[0].add_run("(Source : " + str(s1ANDm1[0]) + ")")
    row_Cells[1].paragraphs[0].add_run("(Methods : " + str(s1ANDm1[1]) + ")")
    if s2ANDm2[0] == "":
        row_Cells[2].text = ""
    else:
        for a, b in sources_name.items():
            if b == s2ANDm2[0]:
                k = a
        output2 = trans[str(k)]
        row_Cells[2].text = output2
        row_Cells[2].paragraphs[0].add_run("(Source : " + str(s2ANDm2[0]) + ")")
        row_Cells[2].paragraphs[0].add_run("(Methods : " + str(s2ANDm2[1]) + ")")
    if s3ANDm3[0] == "":
        row_Cells[3].text = ""
    else:
        for a, b in sources_name.items():
            if b == s3ANDm3[0]:
                k = a
        output3 = trans[str(k)]
        row_Cells[3].text = output3
        row_Cells[3].paragraphs[0].add_run("(Source : " + str(s3ANDm3[0]) + ")")
        row_Cells[3].paragraphs[0].add_run("(Methods : " + str(s3ANDm3[1]) + ")")
for scene in tqdm(scenes[:5]):  # NOTE: as written, only the first five scenes are processed
    for i, line in enumerate(scene):
        if i == 0:
            addSlugLine(doc, line)
            continue
        if isinstance(line, str):
            if global_non_dialogue_flag == "Yes":
                # non_dial_src_lang = language_detector(line)
                # non_dial_translate = non_dial_checker(non_dial_dest_lang, non_dial_src_lang)
                # print("non_dial_translate", non_dial_translate)
                # if non_dial_translate:
                #     if non_dial_src_lang in translation_list and non_dial_dest_lang in translation_list:
                #         trans_text = translate_comparison(line, non_dial_src_lang, non_dial_dest_lang)
                #         addActionLine(doc, trans_text, non_dial_dest_lang)
                #     else:
                #         addActionLine(doc, line, non_dial_dest_lang)
                if (
                    non_dial_src_lang in translation_list
                    and non_dial_dest_lang in translation_list
                ):
                    trans_text = translate_comparison(
                        line, non_dial_src_lang, non_dial_dest_lang
                    )
                    addActionLine(doc, trans_text, non_dial_dest_lang)
                else:
                    addActionLine(doc, line, non_dial_dest_lang)
            else:
                addActionLine(doc, line, non_dial_dest_lang)
        else:
            # print(line)
            [speaker] = line.keys()
            # print([speaker])
            if speaker == "Transition":
                addTransition(doc, line[speaker])
                continue
            addSpeaker(doc, speaker)
            if global_dialogue_flag == "Yes":
                print("In dialogue")
                dial_src_lang = language_detector(line[speaker][2])
                print("dial_src_lang", dial_src_lang)
                # print("p", line[speaker][0])
                if line[speaker][0] != "NONE":
                    out = google(line[speaker][0], dial_src_lang, dial_dest_lang)
                    addParenthetical(doc, out)
                # else:
                #     addParenthetical(doc, line[speaker][0])
                dial_translate = dial_checker(dial_dest_lang, dial_src_lang)
                print("dial_translate", dial_translate)
                if dial_translate:
                    print("dialogue to be translated", line[speaker][2])
                    if line[speaker][2] == "":
                        continue
                    if (
                        dial_src_lang in translation_list
                        and dial_dest_lang in translation_list
                    ):
                        trans_text = translate_comparison(
                            line[speaker][2], dial_src_lang, dial_dest_lang
                        )
                        if dual_dial_script == "Yes":
                            dual_script(doc, line[speaker][2], trans_text, dial_dest_lang)
                        else:
                            addDialogue(doc, trans_text, dial_dest_lang)
                    else:
                        addDialogue(doc, line[speaker][2], dial_dest_lang)
            else:
                addParenthetical(doc, line[speaker][0])
                addDialogue(doc, line[speaker][2], dial_dest_lang)
doc.save(doc_file)
# doc1a.save("1"+doc_file)
# doc1b.save("2"+doc_file)
doc2.save(
    str(filename1.split(".")[0])
    + "_trans_to_"
    + str(dial_dest_lang)
    + "_"
    + "final.docx"
)