Conversion_Kitchen_Code/kitchen_counter/conversion/attachments/final_translation.py

727 lines
20 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
import statistics
from statistics import mode
from indicnlp.tokenize import sentence_tokenize
import nltk
# Probe for the NLTK resources used further down (sentence tokenizer and
# WordNet). Missing resources are only reported, never downloaded here; the
# download calls are kept as hints for whoever provisions the machine.
try:
    print("time1111")
    nltk.data.find("tokenizers/punkt")
except LookupError:
    # nltk.download("punkt")
    pass
try:
    # Resource names are "<category>/<name>"; a bare "wordnet" is never found,
    # even when the corpus is installed.
    nltk.data.find("corpora/wordnet")
except LookupError:  # nltk.download("wordnet")
    print("pass")
from nltk.tokenize import sent_tokenize
# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')
# google
# Point the Google client libraries at a service-account key file. This must
# run before the clients below are constructed.
# NOTE(review): the path is relative to the working directory and overwrites
# any value already present in the environment - confirm this is intended.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# Both clients are created at import time. `translate_client` is the one used
# by language_detector(); `client` and `parent` are not referenced in the
# visible part of this file.
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = "authentic-bongo-272808"
location = "global"
# Resource path string assembled from the project id and location above.
parent = f"projects/{project_id}/locations/{location}"
from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import (
manual_diff_score,
bleu_diff_score,
gleu_diff_score,
meteor_diff_score,
rouge_diff_score,
diff_score,
critera4_5,
)
from selection_source import (
selection_source,
function5,
function41,
function311,
function221,
function2111,
function11111,
selection_source_transliteration,
two_sources_two_outputs,
)
from script_writing import (
addSlugLine,
addActionLine,
addSpeaker,
addParenthetical,
addDialogue,
dual_script,
addTransition,
dial_checker,
non_dial_checker,
)
from script_reading import (
breaksen,
getRefined,
getSlugAndNonSlug,
getSpeakers,
getScenes,
)
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import (
azure_transliteration,
indic_trans,
om_transliterator,
libindic,
indic_transliteration_IAST,
indic_transliteration_ITRANS,
sheetal,
ritwik,
)
# Positional command-line arguments, in the order the script expects them.
filename1 = sys.argv[1]  # original file
non_dial_src_lang = sys.argv[2]
non_dial_dest_lang = sys.argv[3]
dial_src_lang = sys.argv[4]
dial_dest_lang = sys.argv[5]
dual_dial_script = sys.argv[6]  # yes/No

# "Yes" when source and destination languages differ, i.e. translation is
# requested for that kind of line; "No" otherwise.
global_non_dialogue_flag = (
    "No" if non_dial_src_lang == non_dial_dest_lang else "Yes"
)
global_dialogue_flag = "No" if dial_src_lang == dial_dest_lang else "Yes"
# filename1 = sys.argv[1]
# dial_dest_lang = sys.argv[2]
# #dial_dest_lang = user_script_data.get("dial_dest_language")
# dial_dest_script = sys.argv[3]
# #dial_dest_script = user_script_data.get("dial_dest_script")
# non_dial_dest_lang = sys.argv[4]
# #non_dial_dest_lang = user_script_data.get("nondial_dest_language")
# dual_dial_script = sys.argv[5]
# #dual_dial_script = user_script_data.get("dual_dial_script") # Yes,No
# Language codes for which translation is attempted. The script only uses this
# list for membership tests. Duplicate entries from the original list ("gu"
# and the repeated "he" ... "tk" run) were removed; first-occurrence order is
# unchanged.
translation_list = [
    "en", "ta", "hi", "ar", "ur", "kn", "gu", "bg", "bn", "te",
    "ml", "ru", "sr", "uk", "hr", "ga", "sq", "mr", "fa", "tr",
    "hu", "it", "ro", "pa", "or", "zh", "ne", "fr", "es", "id",
    "el", "ja", "ko", "be", "uz", "sd", "af", "de", "is", "ig",
    "la", "pt", "my", "th", "su", "lo", "am", "si", "az", "kk",
    "mk", "bs", "ps", "mg", "ms", "yo", "cs", "da", "nl", "tl",
    "no", "sl", "sv", "vi", "cy", "he", "hy", "km", "ka", "mn",
    "ku", "ky", "tk", "fi", "ht", "haw", "lt", "lb", "mt", "pl",
    "eo", "tt", "ug", "ha", "so", "sw", "yi", "eu", "ca", "ceb",
    "co", "et", "fy", "gl", "hmn", "rw", "lv", "mi", "sm", "gd",
    "st", "sn", "sk", "xh", "zu",
]
def _new_landscape_table_doc(title, headers):
    """Create a landscape report document containing a one-row header table.

    The document gets 0.2 inch margins on every section, its last section is
    turned to landscape by swapping page width and height, and it receives a
    level-0 heading, a bold line naming the translation resources, and a
    "TableGrid" table with one bold header cell per entry in ``headers``.

    Args:
        title: Text of the document heading.
        headers: Column captions, in order; their count sets the table width.

    Returns:
        A ``(document, table)`` tuple.
    """
    document = docx.Document()
    for section in document.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)
    # Swapping the two page dimensions is what makes the page landscape.
    last_section = document.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height,
        last_section.page_width,
    )
    document.add_heading(title, 0)
    document.add_paragraph().add_run(
        "Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex"
    ).bold = True
    table = document.add_table(rows=1, cols=len(headers))
    table.style = "TableGrid"
    for cell, header in zip(table.rows[0].cells, headers):
        cell.paragraphs[0].add_run(header).bold = True
    return document, table


# create an instance of a word document
doc = docx.Document()
doc_file = (
    "translated/"
    + str(dial_dest_lang)
    + "_"
    + "trans"
    + "_of_"
    + ntpath.basename(filename1)
)
# The translated script is saved under "translated/"; create the directory up
# front so the save at the end of the run cannot fail on a missing folder.
os.makedirs("translated", exist_ok=True)
print(doc_file)

# Per-metric comparison tables (first and second half of the metric columns).
doc1a, table1a = _new_landscape_table_doc(
    "Dialogue Comparision Table of " + doc_file,
    ["Input", "Google", "MNF Diff Score Method", "Bleu Diff Score Method"],
)
doc1b, table1b = _new_landscape_table_doc(
    "Dialogue Comparision Table of " + doc_file,
    [
        "Gleu Diff Score Method",
        "Meteor Diff Score Method",
        "Rougen Diff Score Method",
        "Rougel Diff Score Method",
    ],
)
# Final table: the input sentence next to the three selected outputs.
doc2, table2 = _new_landscape_table_doc(
    "Final table " + doc_file,
    ["Input", "Output1", "Output2", "Output3"],
)
# Process the input script and break it into its parts. All helpers come from
# script_reading; the shapes of their return values are not visible here, so
# the notes below describe only how the results are passed along.
# getRefined() is given the input file path and returns two values.
refined, total_scenes = getRefined(filename1)
print(refined)
# log.debug(refined)
# getSlugAndNonSlug() splits the refined content into two results; only the
# second one is used further (to find the speakers).
sluglines, without_slug = getSlugAndNonSlug(refined)
print(sluglines)
# log.debug(sluglines)
characters = getSpeakers(without_slug)
# print(characters)
# log.debug(characters)
# getScenes() returns five values; only `scenes` is consumed by the
# translation loop in this file.
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(
refined, total_scenes, characters
)
# print(scenes)
# to detect the language
def language_detector(text):
    """Return the language code Google detects for ``text``.

    Uses the dedicated detection call of the v2 client instead of translating
    the whole text into Hindi just to read back the detected source language.

    Args:
        text: A single string whose language should be detected.

    Returns:
        The detected language code as reported by the API.
    """
    result = translate_client.detect_language(text)
    return result["language"]
class myDict(dict):
    """A ``dict`` with an explicit ``add(key, value)`` convenience method."""

    def __init__(self, *args, **kwargs):
        # The original body rebound the local name ``self`` to a new dict,
        # which had no effect. Delegate to ``dict`` so that the usual
        # constructor forms (mapping, iterable of pairs, keywords) work too.
        super().__init__(*args, **kwargs)

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry."""
        self[key] = value
def all_translator(sentence, source_lang, target_lang):
    """Translate ``sentence`` with every available engine and pick one result.

    Each engine is tried in a fixed order. An engine that raises is skipped,
    so the results are numbered "0", "1", ... in order of success and "0" is
    the first engine that answered (Google when it is available). The results
    are then handed to ``compare_outputs`` to choose the final translation.

    Args:
        sentence: Text to translate.
        source_lang: Source language code (not passed to the Azure call).
        target_lang: Destination language code.

    Returns:
        The translation selected by ``compare_outputs``; the untranslated
        ``sentence`` if no engine produced a result.
    """
    print("in all translator", sentence)
    engines = (
        ("GOOGLE", lambda: google(sentence, source_lang, target_lang)),
        ("IBM_WATSON", lambda: ibm_watson(sentence, source_lang, target_lang)),
        ("AWS", lambda: aws(sentence, source_lang, target_lang)),
        ("AZURE", lambda: azure(sentence, target_lang)),
        ("LINGVANEX", lambda: lingvanex(sentence, source_lang, target_lang)),
        ("YANDEX", lambda: yandex(sentence, source_lang, target_lang)),
    )
    trans = myDict()
    sources_name = myDict()
    for engine_name, run_engine in engines:
        try:
            output = run_engine()
        except Exception as error:
            # Best effort: one failing engine must not stop the others, but
            # say which one failed instead of swallowing it silently.
            print("translation engine failed:", engine_name, error)
            continue
        key = str(len(trans))
        trans.add(key, output)
        sources_name.add(key, engine_name)
    if not trans:
        # Nothing to compare; keep the original text rather than failing on
        # the missing "0" entry.
        print("no translation engine returned a result for:", sentence)
        return sentence
    return compare_outputs(sentence, trans["0"], trans, sources_name)
# Ellipsis markers that split a sentence into separately translated fragments.
# The capturing group makes re.split() return the markers as well, so they can
# be re-attached to the translated text.
_ELLIPSIS_PATTERN = re.compile(r"(\.\.\.|\. \. \.|\u2026)")


def translation_with_spcecial_dots(Sentence, source_lang, target_lang):
    """Translate a sentence that contains ellipsis markers, fragment by fragment.

    The sentence is split on "...", ". . ." and the single-character ellipsis.
    Each non-blank fragment is translated with ``all_translator`` and the
    marker that followed it is appended to its translation. The original list
    of markers contained an empty string, on which ``str.split`` raises
    ``ValueError``; it also reused ``Sentence`` as the inner loop variable, so
    later markers re-processed the last fragment.

    Args:
        Sentence: Text to translate; may contain any mix of the markers.
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        The translated fragments (with their markers) joined by single spaces.
    """
    translated_text = []
    for index, piece in enumerate(_ELLIPSIS_PATTERN.split(Sentence)):
        if index % 2:
            # Odd positions hold the captured markers: attach the marker to
            # the fragment before it, or keep it alone if it leads the text.
            if translated_text:
                translated_text[-1] += piece
            else:
                translated_text.append(piece)
            continue
        if not piece.strip():
            continue
        translated_text.append(all_translator(piece, source_lang, target_lang))
    return " ".join(translated_text)
def translate_comparison(text, source_lang, target_lang):
    """Translate a paragraph sentence by sentence and join the results.

    The text is split with ``sent_tokenize``. Sentences containing an ellipsis
    marker go through ``translation_with_spcecial_dots``; all others go
    straight to ``all_translator``. The marker list used to contain an empty
    string, which is "in" every sentence and therefore sent every sentence
    down the ellipsis path; it is now the single-character ellipsis.

    Args:
        text: Paragraph to translate.
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        The translated sentences joined by single spaces.
    """
    special_characters = ["...", "\u2026", ". . ."]
    translated_text = []
    for sentence in sent_tokenize(text):
        if sentence == " " or sentence == "":
            continue
        if any(marker in sentence for marker in special_characters):
            translate_sentence = translation_with_spcecial_dots
        else:
            translate_sentence = all_translator
        translated_text.append(
            translate_sentence(sentence, source_lang, target_lang)
        )
    return " ".join(translated_text)
# take a sentence and give translated sentence by comparing outputs from different resources
def compare_outputs(sentence, t0, trans, sources_name):
    """Choose one translation of ``sentence`` from the engine outputs.

    Every scoring method (MNF, Gleu, Meteor, Rougen, Rougel) proposes an
    output and the engine it came from. If all proposals equal the baseline
    ``t0``, the baseline is returned. Otherwise the engines whose proposal
    differs from the baseline are passed to ``selection_source``, a row is
    added to the final comparison table (``doc2`` / ``table2``), and the
    translation of the top-ranked engine is returned. The per-metric tables
    (doc1a / doc1b) are not filled here.

    Args:
        sentence: The source sentence (first column of the table row).
        t0: Baseline translation; callers pass ``trans["0"]``.
        trans: Mapping of result index (as string) to translated text.
        sources_name: Mapping of the same indexes to engine names.

    Returns:
        The selected translation. Falls back to the baseline when the
        top-ranked engine is not present in ``sources_name``.
    """
    methods_name = {
        "0": "MNF",
        "1": "Gleu",
        "2": "Meteor",
        "3": "Rougen",
        "4": "Rougel",
    }
    google_output = t0
    output1, source1 = manual_diff_score(trans, sources_name)
    output2, source2 = gleu_diff_score(trans, sources_name)
    output3, source3 = meteor_diff_score(trans, sources_name)
    output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)
    proposals = [
        (output1, source1),
        (output2, source2),
        (output3, source3),
        (output4, source4),
        (output5, source5),
    ]
    if all(output == google_output for output, _ in proposals):
        return google_output

    # One entry per method, in methods_name order: the proposing engine, or a
    # single-space placeholder when the method agrees with the baseline.
    s = [
        source if output != google_output else " "
        for output, source in proposals
    ]
    s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
        s, sources_name, trans, methods_name
    )
    add_dial_comparison_doc2(
        doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
    )

    # Look up the translation of the top-ranked engine. The last matching
    # entry wins, as in the original lookup loop.
    selected_key = None
    for key, engine_name in sources_name.items():
        if engine_name == s1ANDm1[0]:
            selected_key = key
    if selected_key is None:
        # Previously this fell through to a KeyError on an unrelated value.
        return google_output
    return trans[str(selected_key)]
def add_dial_comparison_doc1a(doc1a, table1a, k, s, selected_source):
    """Append one row to the first per-metric comparison table.

    The four cells receive ``k[0]`` .. ``k[3]``. Cells 2 and 3 additionally
    get a "(Source : ...)" run for ``s[0]`` and ``s[1]`` unless that entry is
    the single-space placeholder; the run is bold when the entry equals
    ``selected_source``. ``doc1a`` is accepted but not used.
    """
    cells = table1a.add_row().cells
    for column in range(4):
        cells[column].text = k[column]
    for offset in (0, 1):
        if s[offset] == " ":
            continue
        run = cells[offset + 2].paragraphs[0].add_run(
            "(Source : " + s[offset] + ")"
        )
        if s[offset] == selected_source:
            run.bold = True
def add_dial_comparison_doc1b(doc1b, table1b, k, s, selected_source):
    """Append one row to the second per-metric comparison table.

    Cell ``j`` receives ``k[j + 4]`` and, unless ``s[j + 2]`` is the
    single-space placeholder, a "(Source : ...)" run for it (bold when it
    equals ``selected_source``). Pairing with ``zip`` limits the row to the
    cells and entries that exist; the original indexed ``s[2]`` .. ``s[5]``
    unconditionally and raised ``IndexError`` for the five-entry source list
    that ``compare_outputs`` builds. ``doc1b`` is accepted but not used.
    """
    row_Cells = table1b.add_row().cells
    for cell, text in zip(row_Cells, k[4:]):
        cell.text = text
    for cell, source in zip(row_Cells, s[2:]):
        if source == " ":
            continue
        run = cell.paragraphs[0].add_run("(Source : " + source + ")")
        if source == selected_source:
            run.bold = True
def _key_for_source(source, sources_name):
    """Return the key in ``sources_name`` whose value equals ``source``, or None."""
    matched = None
    for key, engine_name in sources_name.items():
        if engine_name == source:
            matched = key  # last match wins, as in the original lookup loops
    return matched


def _fill_output_cell(cell, sourceANDmethods, sources_name, trans):
    """Write one ranked output (translation, source, methods) into ``cell``.

    The cell is left blank when the source is empty or unknown.
    """
    source = sourceANDmethods[0]
    key = None if source == "" else _key_for_source(source, sources_name)
    if key is None:
        cell.text = ""
        return
    cell.text = trans[str(key)]
    paragraph = cell.paragraphs[0]
    paragraph.add_run("(Source : " + str(source) + ")")
    paragraph.add_run("(Methods : " + str(sourceANDmethods[1]) + ")")


# to return the table with best 3 outputs
def add_dial_comparison_doc2(
    doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans
):
    """Append a row with the input sentence and the three ranked outputs.

    Column 0 holds ``sentence``; columns 1-3 hold the translation of the
    engine named in ``s1ANDm1[0]``, ``s2ANDm2[0]`` and ``s3ANDm3[0]``,
    followed by "(Source : ...)" and "(Methods : ...)" runs. A column whose
    source is empty or not found in ``sources_name`` stays blank. Previously
    an unknown source left the lookup key unbound (first column) or stale
    (later columns), which showed the wrong translation. ``doc2`` is accepted
    but not used.

    Args:
        doc2: The final-table document (unused).
        table2: Table that receives the new row.
        sentence: Source sentence.
        s1ANDm1, s2ANDm2, s3ANDm3: Indexable pairs of (engine name, methods).
        sources_name: Mapping of result index to engine name.
        trans: Mapping of result index (as string) to translated text.
    """
    row_Cells = table2.add_row().cells
    row_Cells[0].text = sentence
    for column, ranked in enumerate((s1ANDm1, s2ANDm2, s3ANDm3), start=1):
        _fill_output_cell(row_Cells[column], ranked, sources_name, trans)
# Walk the scenes and write the translated script.
# NOTE(review): only the first five scenes are processed (scenes[:5]). This
# looks like a leftover test limit - confirm before relying on full output.
for scene in tqdm(scenes[:5]):
    for i, line in enumerate(scene):
        # The first entry of every scene is written as the slug line.
        if i == 0:
            addSlugLine(doc, line)
            continue

        # Plain strings are non-dialogue (action) lines.
        if isinstance(line, str):
            translate_action = (
                global_non_dialogue_flag == "Yes"
                and non_dial_src_lang in translation_list
                and non_dial_dest_lang in translation_list
            )
            if translate_action:
                trans_text = translate_comparison(
                    line, non_dial_src_lang, non_dial_dest_lang
                )
                addActionLine(doc, trans_text, non_dial_dest_lang)
            else:
                addActionLine(doc, line, non_dial_dest_lang)
            continue

        # Everything else is a single-key mapping: {speaker: details}.
        [speaker] = line.keys()
        if speaker == "Transition":
            addTransition(doc, line[speaker])
            continue

        addSpeaker(doc, speaker)
        # Index 0 is used as the parenthetical and index 2 as the dialogue.
        parenthetical = line[speaker][0]
        dialogue = line[speaker][2]

        if global_dialogue_flag != "Yes":
            addParenthetical(doc, parenthetical)
            addDialogue(doc, dialogue, dial_dest_lang)
            continue

        print("In dialogue")
        # The detected language replaces the command-line source language.
        dial_src_lang = language_detector(dialogue)
        print("dial_src_lang", dial_src_lang)
        if parenthetical != "NONE":
            out = google(parenthetical, dial_src_lang, dial_dest_lang)
            addParenthetical(doc, out)
        dial_translate = dial_checker(dial_dest_lang, dial_src_lang)
        print("dial_translate", dial_translate)
        if dial_translate:
            print("dialogue to be translated", dialogue)
        if dialogue == "":
            continue

        if (
            dial_translate
            and dial_src_lang in translation_list
            and dial_dest_lang in translation_list
        ):
            trans_text = translate_comparison(
                dialogue, dial_src_lang, dial_dest_lang
            )
            if dual_dial_script == "Yes":
                dual_script(doc, dialogue, trans_text, dial_dest_lang)
            else:
                addDialogue(doc, trans_text, dial_dest_lang)
        else:
            # Keep the original dialogue whenever it is not translated, so no
            # line is dropped from the output script.
            addDialogue(doc, dialogue, dial_dest_lang)

doc.save(doc_file)
# doc1a.save("1"+doc_file)
# doc1b.save("2"+doc_file)
# Strip only the extension: splitting on the first "." breaks for paths such
# as "./script.docx" or names that contain dots.
doc2.save(
    os.path.splitext(filename1)[0]
    + "_trans_to_"
    + str(dial_dest_lang)
    + "_"
    + "final.docx"
)