# Source path: Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/final_translation_subtitile.py
# Repository viewer metadata: 496 lines, 17 KiB, Python; last change 2024-04-27 09:33:09 +00:00.
import os
import sys
import docx
import re
# import textract
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
import statistics
from statistics import mode
from indicnlp.tokenize import sentence_tokenize
import nltk
# Probe for the NLTK data packages at import time.  Nothing is downloaded
# here (the download calls were already commented out in the original); a
# missing package is only reported or ignored.
try:
    print("time33333")
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # nltk.data.find raises LookupError when the resource is absent.  The
    # original ignored the failure, so it is still ignored here, but other
    # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    pass
try:
    # Resource names are category-qualified relative paths; the bare name
    # 'wordnet' does not match the standard install location.
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("error in finding wordnet222222222")
from nltk.tokenize import sent_tokenize
# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')
#google
# Google Cloud Translation setup.  The credentials environment variable is
# assigned before either client object is constructed.
# NOTE(review): the key-file path is relative, so it resolves against the
# process working directory -- confirm the script is always started from the
# directory that holds this file.
# NOTE(review): the key file name mentions "gifted-mountain-318504" while
# project_id below is "excellent-hue-272808" -- confirm both are intended.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="gifted-mountain-318504-0a5f94cda0c8.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client; language_detector() in this file calls its translate() method.
translate_client = Translate.Client()
# NOTE(review): `client` and `parent` are not referenced in the visible part
# of this file -- verify they are still needed.
client = translate.TranslationServiceClient()
project_id = "excellent-hue-272808"
location = "global"
parent = f"projects/{project_id}/locations/{location}"
from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
from script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker, non_dial_checker
from script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik
from script_writing import default_script
from nltk.tokenize import regexp_tokenize
#comes from frontend after detection algo
# filename1 = sys.argv[1]
# dial_src_lang = sys.argv[2]
# dial_dest_lang = sys.argv[3]
# non_dial_src_lang = sys.argv[4]
# non_dial_dest_lang = sys.argv[5]
# dual_dial_script = sys.argv[6]
# Decide which parts of the script get translated: a part is flagged "Yes"
# only when its source and destination languages differ.
# NOTE(review): the four language names are not assigned in the visible part
# of this file (the sys.argv lines above are commented out) -- confirm where
# they come from.
global_non_dialogue_flag = "Yes" if non_dial_src_lang != non_dial_dest_lang else "No"
global_dialogue_flag = "Yes" if dial_src_lang != dial_dest_lang else "No"
# Language codes for which translation is attempted; the main loop only
# translates when both the source and the destination code are listed here.
# Duplicate entries from the original ('gu' and a repeated run from 'he' to
# 'tk') were removed; first-occurrence order is preserved.
translation_list = [
    'en', 'ta', 'hi', 'ar', 'ur', 'kn', 'gu', 'bg', 'bn', 'te', 'ml', 'ru',
    'sr', 'uk', 'hr', 'ga', 'sq', 'mr',
    'fa', 'tr', 'hu', 'it', 'ro', 'pa', 'or', 'zh-CN', 'zh-TW', 'ne', 'fr',
    'es', 'id', 'el', 'ja', 'ko', 'be', 'uz', 'sd', 'af', 'de', 'is',
    'ig', 'la', 'pt', 'my', 'th', 'su', 'lo', 'am', 'si', 'az', 'kk', 'mk',
    'bs', 'ps', 'mg', 'ms', 'yo', 'cs', 'da', 'nl', 'tl', 'no', 'sl', 'sv',
    'vi', 'cy', 'he', 'hy', 'km', 'ka', 'mn', 'ku', 'ky', 'tk', 'fi', 'ht',
    'haw', 'lt', 'lb', 'mt',
    'pl', 'eo', 'tt', 'ug', 'ha', 'so', 'sw', 'yi', 'eu', 'ca', 'ceb', 'co',
    'et', 'fy', 'gl', 'hmn', 'rw', 'lv', 'mi', 'sm', 'gd', 'st', 'sn', 'sk',
    'xh', 'zu',
]
# Word document that receives the translated script, and the path it is
# saved under at the end of the run.
doc = docx.Document()
doc_file = f"translated/{dial_dest_lang}_trans_of_{ntpath.basename(filename1)}"
#print(doc_file)

# Second document: a comparison report with one table row per sentence.
doc2 = docx.Document()
sections = doc2.sections
for section in sections:
    # The same 0.2 inch margin on all four sides.
    section.top_margin = section.bottom_margin = Inches(0.2)
    section.left_margin = section.right_margin = Inches(0.2)

# Exchange width and height of the last section's page.
section = doc2.sections[-1]
section.page_width, section.page_height = section.page_height, section.page_width

name = f"Final table {doc_file}"
doc2.add_heading(name, 0)
doc_para = doc2.add_paragraph()
doc_para.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True

# Four-column table: the input sentence followed by up to three outputs.
table2 = doc2.add_table(rows=1, cols=4)
table2.style = 'TableGrid'
hdr_Cells = table2.rows[0].cells
for header_cell, title in zip(hdr_Cells, ("Input", "Output1", "Output2", "Output3")):
    header_cell.paragraphs[0].add_run(title).bold = True
# Parse the input script (filename1) into the structures used by the main
# loop at the end of this file.
# getRefined() is unpacked into the refined script and a scene count/collection
# (exact shapes are defined in script_reading -- not visible here).
refined,total_scenes = getRefined(filename1)
#print(refined)
#log.debug(refined)
# Split the refined script into slug lines and everything else.
sluglines,without_slug = getSlugAndNonSlug(refined)
#print(sluglines)
#log.debug(sluglines)
# Speaker names are collected from the non-slug part only.
characters = getSpeakers(without_slug)
#log.debug(characters)
# Only `scenes` is used later in the visible part of this file; the other four
# results are unpacked but not referenced below.
scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
# NOTE(review): prints the whole parsed script to stdout -- looks like debug
# output; confirm it is still wanted.
print(scenes)
# Detect the language of a piece of text.
def language_detector(text):
    """Return the source-language code reported for *text*.

    The text is passed to ``translate_client.translate`` with Hindi ('hi')
    as the target language; only the ``detectedSourceLanguage`` field of
    the response is used, the translation itself is discarded.
    """
    return translate_client.translate(text, target_language='hi')["detectedSourceLanguage"]
class myDict(dict):
    """A plain ``dict`` with an ``add`` convenience method.

    The constructor takes no arguments and always starts empty.
    """

    def __init__(self):
        # The original body rebound the local name ``self`` to a new dict,
        # which has no effect on the instance; initialise the base class
        # explicitly instead.
        super().__init__()

    def add(self, key, value):
        """Store *value* under *key*, replacing any existing entry."""
        self[key] = value
def all_translator(sentence, source_lang, target_lang):
    """Translate *sentence* with every provider and return the selected output.

    Each provider is tried in a fixed order.  A provider that raises is
    skipped; the successful ones are numbered "0", "1", ... in the order
    they succeeded, so key "0" is the first provider that returned a result
    (Google when it succeeds).  The collected translations are then handed
    to ``compare_outputs`` which picks the final text.

    Args:
        sentence: Text to translate.
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        Whatever ``compare_outputs`` returns for the collected translations.

    Raises:
        KeyError: If no provider produced a translation (there is no "0"
            entry to use as the reference output).
    """
    # Deferred calls in priority order.  The labels are the names written
    # into the comparison report.
    providers = (
        ("GOOGLE", lambda: google(sentence, source_lang, target_lang)),
        ("IBM_WATSON", lambda: ibm_watson(sentence, source_lang, target_lang)),
        ("AWS", lambda: aws(sentence, source_lang, target_lang)),
        # azure is called without a source language, as in the original.
        ("AZURE", lambda: azure(sentence, target_lang)),
        ("LINGVANEX", lambda: lingvanex(sentence, source_lang, target_lang)),
        ("YANDEX", lambda: yandex(sentence, source_lang, target_lang)),
    )
    trans = myDict()
    sources_name = myDict()
    for label, call in providers:
        try:
            result = call()
        except Exception:
            # Best effort: one failing provider must not stop the others.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        slot = str(len(trans))
        trans.add(slot, result)
        sources_name.add(slot, label)
    return compare_outputs(sentence, trans["0"], trans, sources_name, target_lang)
def recursive_dots(Sentence, source_lang, target_lang):
    """Translate *Sentence* piece by piece around an ellipsis marker.

    The sentence is split on the first marker (in list order) that it
    contains.  Empty and single-space pieces are dropped.  A piece that
    still contains a marker is handed to ``translation_with_spcecial_dots``;
    any other piece is translated with ``all_translator`` and, unless it is
    the last piece of the split, gets the marker appended again.  The
    translated pieces are joined with single spaces.

    Args:
        Sentence: Text to translate.
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        The reassembled translation.  A sentence without any marker is
        translated as a whole instead of being dropped.
    """
    # NOTE(review): the original list also contained an empty string.  An
    # empty marker is "in" every string and makes str.split raise
    # ValueError, so it is left out.  It may have been a character lost in
    # transit (possibly the single-character ellipsis) -- TODO confirm.
    special_characters = ['....', '. . .', '...']
    splitter = next((mark for mark in special_characters if mark in Sentence), None)
    if splitter is None:
        return all_translator(Sentence, source_lang, target_lang)

    pieces = Sentence.split(splitter)
    last_index = len(pieces) - 1
    translated_text = []
    # Iterate with a separate name and by position: the original reused
    # ``Sentence`` as loop variable (so later markers were matched against
    # the last piece) and detected the last piece by value, which dropped
    # the marker after any earlier piece with the same text.
    for index, piece in enumerate(pieces):
        if piece == "" or piece == " ":
            continue
        if any(mark in piece for mark in special_characters):
            trans_text = translation_with_spcecial_dots(piece, source_lang, target_lang)
        elif index != last_index:
            trans_text = all_translator(piece, source_lang, target_lang) + splitter
        else:
            trans_text = all_translator(piece, source_lang, target_lang)
        translated_text.append(trans_text)
    return " ".join(translated_text)
def translation_with_spcecial_dots(Sentence, source_lang, target_lang):
    """Translate a sentence that contains an ellipsis marker.

    The sentence is split on the first marker (in list order) that it
    contains.  Empty and single-space pieces are dropped.  A piece that
    still contains a marker is handed to ``recursive_dots``; any other
    piece is translated with ``all_translator`` and, unless it is the last
    piece of the split, gets the marker appended again.  The translated
    pieces are joined with single spaces.

    Args:
        Sentence: Text to translate.
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        The reassembled translation.  If the sentence contains no marker it
        is translated as a whole (the original raised NameError because the
        splitter was never assigned).
    """
    # NOTE(review): the original list also contained an empty string.  An
    # empty marker is "in" every string and makes str.split raise
    # ValueError, so it is left out.  It may have been a character lost in
    # transit (possibly the single-character ellipsis) -- TODO confirm.
    special_characters = ['....', '. . .', '...']
    splitter = next((mark for mark in special_characters if mark in Sentence), None)
    if splitter is None:
        return all_translator(Sentence, source_lang, target_lang)

    pieces = Sentence.split(splitter)
    last_index = len(pieces) - 1
    translated_text = []
    # "Last piece" is decided by position; comparing by value (as the
    # original did) drops the marker after any earlier piece whose text
    # equals the last piece.
    for index, piece in enumerate(pieces):
        if piece == "" or piece == " ":
            continue
        if any(mark in piece for mark in special_characters):
            trans_text = recursive_dots(piece, source_lang, target_lang)
        elif index != last_index:
            trans_text = all_translator(piece, source_lang, target_lang) + splitter
        else:
            trans_text = all_translator(piece, source_lang, target_lang)
        translated_text.append(trans_text)
    return " ".join(translated_text)
def translate_comparison(text, source_lang, target_lang):
    """Translate *text* sentence by sentence and join the results.

    The text is split with ``sent_tokenize``.  A sentence containing an
    ellipsis marker goes through ``translation_with_spcecial_dots``; every
    other sentence goes straight to ``all_translator``.

    Args:
        text: Text to translate (may hold several sentences).
        source_lang: Source language code.
        target_lang: Destination language code.

    Returns:
        The translated sentences joined with single spaces.
    """
    # NOTE(review): the original list also contained an empty string, which
    # is "in" every sentence and therefore sent every sentence down the
    # ellipsis path.  It is left out here; it may have been a character
    # lost in transit (possibly the single-character ellipsis) -- TODO
    # confirm.
    special_characters = ['....', '. . .', '...']
    translated_text = []
    for sentence in sent_tokenize(text):
        if any(mark in sentence for mark in special_characters):
            trans_text = translation_with_spcecial_dots(sentence, source_lang, target_lang)
        else:
            trans_text = all_translator(sentence, source_lang, target_lang)
        translated_text.append(trans_text)
    return " ".join(translated_text)
def script_det(text):
    """Return the script of the first non-punctuation character of *text*.

    The first character that is not in the punctuation set is passed to
    ``script_cat`` and element 0 of its result is returned.  When *text* is
    empty or consists only of punctuation, an empty string is passed to
    ``script_cat`` (same as the original).
    """
    # The backslash is written as an explicit escape; the original "\,"
    # relied on an invalid escape sequence.  The set of characters is
    # unchanged.
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~“"'''
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]
def punct_remover(string):
    """Return *string* with punctuation marks and ASCII digits blanked out.

    Every character of the punctuation set below is replaced by a single
    space, so the length of the string is preserved.  All other characters
    are returned unchanged.
    """
    # The backslash is written as an explicit escape; the original "\,"
    # relied on an invalid escape sequence.  The set of characters is
    # unchanged.
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~…।1234567890'''
    # One translation pass instead of one str.replace per character found.
    return string.translate(str.maketrans(dict.fromkeys(punctuations, " ")))
def word_transliterate(sentence, dest_script):
    """Return *sentence* unchanged.

    Placeholder: no transliteration is performed and *dest_script* is not
    used.
    """
    return sentence
def final_out(output1, output2, output3, dest_lang):
    """Pick the first candidate written entirely in the destination script.

    Each candidate is stripped of punctuation with ``punct_remover`` and
    split into words; a word is "foreign" when ``script_det`` reports a
    script other than ``default_script[dest_lang]``.  The candidates are
    tried in order and the first one without a foreign word is returned.
    When all three contain a foreign word, ``output1`` is passed through
    ``word_transliterate`` and returned.
    """
    def has_foreign_word(cleaned):
        # Stops at the first foreign word, like the original nested loops.
        return any(script_det(word) != default_script[dest_lang] for word in cleaned.split())

    # All three candidates are cleaned up front, as in the original.
    cleaned1 = punct_remover(output1)
    cleaned2 = punct_remover(output2)
    cleaned3 = punct_remover(output3)

    if not has_foreign_word(cleaned1):
        return output1
    if not has_foreign_word(cleaned2):
        return output2
    if not has_foreign_word(cleaned3):
        return output3
    return word_transliterate(output1, default_script[dest_lang])
# Take a sentence and return its translation, chosen by comparing the
# outputs of the different translation resources.
def compare_outputs(sentence, t0, trans, sources_name, target_lang):
    """Choose the final translation of *sentence* from the provider outputs.

    Five scoring functions (MNF, Gleu, Meteor, Rougen, Rougel) each select
    one output from *trans*.  If all five agree with the reference output
    *t0*, that output is returned directly.  Otherwise the sources that
    disagree with the reference are passed to ``selection_source`` which
    ranks up to three sources; a row is added to the comparison report via
    ``add_dial_comparison_doc2`` (using the module-level ``doc2`` and
    ``table2``) and the ranked outputs go through ``final_out``.

    Args:
        sentence: The original sentence.
        t0: Reference translation (entry "0" of *trans*).
        trans: Mapping of slot key to translated text.
        sources_name: Mapping of the same slot keys to provider labels.
        target_lang: Destination language code.

    Returns:
        The selected translation.

    Raises:
        KeyError: If a source label returned by ``selection_source`` is not
            present in *sources_name*.
    """
    methods_name = {'0':'MNF', '1':'Gleu', '2':'Meteor', '3':'Rougen', '4':'Rougel'}
    google_output = t0

    output1, source1 = manual_diff_score(trans, sources_name)
    output2, source2 = gleu_diff_score(trans, sources_name)
    output3, source3 = meteor_diff_score(trans, sources_name)
    output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)
    picks = (
        (output1, source1),
        (output2, source2),
        (output3, source3),
        (output4, source4),
        (output5, source5),
    )

    if all(output == google_output for output, _ in picks):
        return google_output

    # One entry per method, in methods_name order: the chosen source, or a
    # single space when that method agreed with the reference output.
    s = [source if output != google_output else " " for output, source in picks]

    s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(s, sources_name, trans, methods_name)
    add_dial_comparison_doc2(doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans)

    def translation_from(label):
        # Reverse lookup label -> slot key -> translated text.  The last
        # matching slot wins, as in the original loops.
        matches = [key for key, name in sources_name.items() if name == label]
        if not matches:
            raise KeyError(f"no translation source labelled {label!r}")
        return trans[str(matches[-1])]

    best = translation_from(s1ANDm1[0])
    # An empty label means "no second/third choice": fall back to the best.
    second = translation_from(s2ANDm2[0]) if s2ANDm2[0] != "" else best
    third = translation_from(s3ANDm3[0]) if s3ANDm3[0] != "" else best
    return final_out(best, second, third, target_lang)
# Add one row with the best three outputs to the comparison table.
def add_dial_comparison_doc2(doc2, table2, sentence, s1ANDm1, s2ANDm2, s3ANDm3, sources_name, trans):
    """Append a comparison row for *sentence* to *table2*.

    Column 0 receives the sentence.  Columns 1-3 receive the translation of
    the first, second and third ranked source, each followed by two runs
    naming the source and the methods that chose it.  A ranked entry whose
    source label is the empty string leaves its cell text empty.

    Args:
        doc2: Report document (accepted for interface compatibility; not
            used by this function).
        table2: Table to which the row is added.
        sentence: The original sentence.
        s1ANDm1, s2ANDm2, s3ANDm3: Ranked entries; index 0 is the source
            label and index 1 the methods description.
        sources_name: Mapping of slot key to provider label.
        trans: Mapping of the same slot keys to translated text.

    Raises:
        KeyError: If a non-empty source label is not in *sources_name*.
            The original silently reused the slot of the previous column
            in that case, writing the wrong translation under the label.
    """
    def translation_from(label):
        # Reverse lookup label -> slot key -> translated text.  The last
        # matching slot wins, as in the original loops.
        matches = [key for key, name in sources_name.items() if name == label]
        if not matches:
            raise KeyError(f"no translation source labelled {label!r}")
        return trans[str(matches[-1])]

    def fill(cell, text, ranked):
        cell.text = text
        cell.paragraphs[0].add_run('(Source : '+str(ranked[0])+')')
        cell.paragraphs[0].add_run('(Methods : '+str(ranked[1])+')')

    row_Cells = table2.add_row().cells
    best = translation_from(s1ANDm1[0])
    row_Cells[0].text = sentence
    fill(row_Cells[1], best, s1ANDm1)
    for cell, ranked in ((row_Cells[2], s2ANDm2), (row_Cells[3], s3ANDm3)):
        if ranked[0] == "":
            cell.text = ""
        else:
            fill(cell, translation_from(ranked[0]), ranked)
# Main driver: walk the parsed scenes, write each element (translated where
# required) into `doc`, then save both documents.
# NOTE(review): the leading indentation of this block is missing in this copy
# of the file, so the nesting below has to be inferred from statement order;
# in particular the owners of the last two `else:` branches are ambiguous.
# Restore the indentation from version control before editing the logic.
# NOTE(review): `scenes[:1]` restricts the run to the first scene only --
# looks like a debugging limit; confirm before relying on full output.
for scene in tqdm(scenes[:1]):
for i,line in enumerate(scene):
# The first element of every scene is written as its slug line.
if i == 0:
addSlugLine(doc,line)
continue
# Plain strings are written as action lines (translated only when the
# non-dialogue flag is "Yes" and both language codes are in translation_list).
if type(line)==type(""):
if global_non_dialogue_flag == "Yes":
if non_dial_src_lang in translation_list and non_dial_dest_lang in translation_list:
trans_text = translate_comparison(line , non_dial_src_lang, non_dial_dest_lang)
addActionLine(doc, trans_text, non_dial_dest_lang)
else:
addActionLine(doc, line, non_dial_dest_lang)
else:
addActionLine(doc, line, non_dial_dest_lang)
else:
# Any other element is unpacked as a mapping with exactly one key; the
# unpacking raises ValueError if there are zero or several keys.
#print(line)
[speaker] = line.keys()
#print([speaker])
# The key 'Transition' marks a transition instead of a speaker.
if speaker == 'Transition':
addTransition(doc,line[speaker])
continue
addSpeaker(doc,speaker)
if global_dialogue_flag == "Yes":
print("In dialogue")
#print("dilo")
# Index 2 of the value is used as the dialogue text, index 0 as the
# parenthetical ('NONE' is compared against before translating it).
print("dialogue", line[speaker][2])
# The dialogue language is re-detected per dialogue and rebinds the
# module-level dial_src_lang.
dial_src_lang = language_detector(line[speaker][2])
print("dial_src_lang", dial_src_lang)
#print("p", line[speaker][0])
if line[speaker][0] != 'NONE':
print("parenthitical", line[speaker][0])
# The parenthetical gets its own language detection and is translated
# with google() only, not with the multi-provider comparison.
par_lang = language_detector(line[speaker][0])
out = google(line[speaker][0], par_lang, dial_dest_lang)
#out = google(line[speaker][0], dial_src_lang, dial_dest_lang)
addParenthetical(doc,out)
dial_translate = dial_checker(dial_dest_lang, dial_src_lang)
print("dial_translate", dial_translate)
#dial_translate = True
if dial_translate:
print("dialogue to be translated", line[speaker][2])
# Empty dialogue: nothing more is written for this element (the speaker
# and any parenthetical were already added above).
if line[speaker][2] == "":
continue
if dial_src_lang in translation_list and dial_dest_lang in translation_list:
trans_text = translate_comparison(line[speaker][2] , dial_src_lang, dial_dest_lang)
# "Yes" writes original and translation together via dual_script();
# otherwise only the translation is written.
if dual_dial_script == "Yes":
dual_script(doc, line[speaker][2], trans_text, dial_dest_lang)
else:
addDialogue(doc, trans_text, dial_dest_lang)
# NOTE(review): this `else:` writes the untranslated dialogue; without
# indentation it cannot be told whether it pairs with the translation_list
# check or with `if dial_translate:` -- confirm.
else:
addDialogue(doc, line[speaker][2], dial_dest_lang)
# NOTE(review): presumably the branch for global_dialogue_flag != "Yes"
# (nothing above has written the parenthetical in that case) -- confirm.
# The parenthetical is written here without a 'NONE' check.
else:
addParenthetical(doc,line[speaker][0])
addDialogue(doc, line[speaker][2], dial_dest_lang)
# Save the translated script and the comparison report; the report file name
# is fixed and relative to the working directory.
doc.save(doc_file)
doc2.save("final_comparision.docx")
#doc2.save(str(filename1.split('.')[0])+"_trans_to_"+str(dial_dest_lang)+'_'+"final.docx")