# Conversion_Kitchen_Code/kitchen_counter/conversion/attachments/final_transliteration.py
#
# 583 lines
# 24 KiB
# Python
# Executable File

import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
from indicnlp.tokenize import sentence_tokenize
import nltk
# Probe for the NLTK data packages used by the scoring helpers. A missing
# package is tolerated here: the download calls were left disabled, so the
# probe only reports (wordnet) or silently ignores (punkt) the absence.
try:
    print("time2222")
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # nltk.data.find raises LookupError when the resource is not installed.
    # Download stays disabled: nltk.download('punkt')
    pass
try:
    nltk.data.find('wordnet')
except LookupError:
    # Download stays disabled: nltk.download('wordnet')
    print("error in finding wordnet11111")
# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')
# Google Cloud Translation setup.
# The credentials variable is set before either client is constructed;
# presumably the client constructors read it, so keep this statement order.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# translate_v2 client: used by language_detector() and MNF_translate().
translate_client = Translate.Client()
# TranslationServiceClient plus the resource path built for it.
# NOTE(review): `client` and `parent` are not referenced in the visible part
# of this file - confirm whether they are still needed.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
from script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker, non_dial_checker
from script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik
# ---- Command-line arguments ------------------------------------------------
# Five positional arguments are required; fail with a usage message instead of
# a bare IndexError when any of them is missing.
if len(sys.argv) < 6:
    sys.exit(
        "usage: final_transliteration.py <translated_file> <dial_dest_script> "
        "<dual_dial_script Yes/No> <translation_and_transliteration Yes/No> "
        "<original_file>"
    )
filename1 = sys.argv[1]  # translated file coming from UI-1 (translation)
dial_dest_script = sys.argv[2]  # script the dialogues are transliterated into
dual_dial_script = sys.argv[3]  # Yes/No
translation_and_transliteration = sys.argv[4]  # Yes/No
filename2 = sys.argv[5]  # original file, or scenes taken from the final translation

# Output Word document and the path it is saved to at the end of the script.
doc = docx.Document()
doc_file = "translated/" + "trans" + ntpath.basename(filename1)
print(doc_file)
def _new_landscape_table_doc(title, headers):
    """Build one comparison document and return ``(document, table)``.

    The document gets 0.2 inch margins on every section, its last section's
    page width and height swapped, a level-0 heading *title*, a bold note
    listing the translation resources, and a 'TableGrid' table with a single
    header row whose cells carry *headers* in bold.
    """
    document = docx.Document()
    margin = Inches(0.2)
    for section in document.sections:
        section.top_margin = margin
        section.bottom_margin = margin
        section.left_margin = margin
        section.right_margin = margin
    # Swap width and height of the last section (portrait -> landscape size).
    last_section = document.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height,
        last_section.page_width,
    )
    document.add_heading(title, 0)
    note = document.add_paragraph()
    note.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True
    table = document.add_table(rows=1, cols=len(headers))
    table.style = 'TableGrid'
    for cell, label in zip(table.rows[0].cells, headers):
        cell.paragraphs[0].add_run(label).bold = True
    return document, table


# First half of the dialogue comparison table (input plus three methods).
doc1a, table1a = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    ("Input", "Google", "MNF Diff Score Method", "Bleu Diff Score Method"),
)
# Second half of the dialogue comparison table (four further methods).
doc1b, table1b = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    (
        "Gleu Diff Score Method",
        "Meteor Diff Score Method",
        "Rougen Diff Score Method",
        "Rougel Diff Score Method",
    ),
)
# Final table: compare_outputs_transliteration() appends one row per word.
doc2, table2 = _new_landscape_table_doc(
    'Final table ' + doc_file,
    ("Input", "Output1", "Output2", "Output3"),
)
# Parse the first input (filename1) into scenes; `scenes` is what the
# detection loop and the main transliteration loop iterate over.
refined,total_scenes = getRefined(filename1)
print(refined)
sluglines,without_slug = getSlugAndNonSlug(refined)
print(sluglines)
characters = getSpeakers(without_slug)
#print(characters)
scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
#print(scenes)
# Parse the second input (filename2) the same way. Every intermediate name
# (refined, total_scenes, sluglines, without_slug, characters, actionline,
# parenthetical_lis, speakers, dialogues) is rebound to the filename2 result;
# only `scenes` (filename1) and `scenes1` (filename2) keep the two apart.
refined,total_scenes = getRefined(filename2)
sluglines,without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes1,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
def language_detector(text):
    """Return the source language detected for *text*.

    The text is sent to the module-level ``translate_client`` with Hindi
    ('hi') as target; only the "detectedSourceLanguage" field of the
    response is returned, the translation itself is discarded.
    """
    return translate_client.translate(text, target_language='hi')["detectedSourceLanguage"]
def script_det(text):
    """Return the script of *text* as reported by ``script_cat``.

    A single probe character is classified: the first alphabetic character
    of *text*. Leading whitespace, digits and punctuation are skipped so they
    are not mistaken for the script of the dialogue. When *text* holds no
    letter at all, the probe falls back to the first character that is not in
    the punctuation list below, or to "" when there is none.

    Returns element 0 of ``script_cat(probe)``.
    """
    # Same characters as before; the backslash is now escaped explicitly
    # instead of relying on the invalid "\," escape sequence.
    punctuations = '!()-[]{};:\'"\\,<>./?@#$%^&*_~'
    probe = next((char for char in text if char.isalpha()), None)
    if probe is None:
        probe = next((char for char in text if char not in punctuations), "")
    return script_cat(probe)[0]
def punct_remover(string):
    """Return *string* with every punctuation character replaced by a space.

    The punctuation set covers ASCII marks plus the ellipsis and the
    Devanagari danda. Each mark becomes exactly one space, so the word
    boundaries of the input are preserved and its length does not change.
    """
    # The backslash is escaped explicitly ("\\") rather than through the
    # invalid "\," escape sequence of the earlier literal.
    punctuations = '!()-[]{};:\'"\\,<>./?@#$%^&*_~…।'
    # One translate pass instead of one str.replace call per character.
    return string.translate(str.maketrans(punctuations, " " * len(punctuations)))
def space_after_punct(text):
    """Return *text* with a space after selected punctuation marks.

    Steps, in order:
    1. the spaced ellipsis '. . .' becomes ' ... ';
    2. a space is inserted after each of , ! ? ( ) … and -;
    3. any run of two or more whitespace characters collapses to one space.
    """
    #text = text.replace('...',' ... ')
    text = text.replace('. . .', ' ... ')
    # Raw strings keep the regex escapes (\1, \s) out of Python's own
    # string-escape processing.
    text = re.sub(r'([,!?()…-])', r'\1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of *original* to the words of *transliterated*.

    *original* is first normalised with ``space_after_punct`` and split on
    whitespace. Walking those tokens in order:

    * a token that is itself a punctuation mark (or '...') is copied as is;
    * any other token consumes the next word of *transliterated*; when the
      token ends in a punctuation mark, that mark is appended to the word;
    * if *transliterated* has run out of words, the original token is kept
      unchanged instead of raising IndexError.

    The tokens are joined with single spaces and ' ... ' is collapsed back
    to '...'.

    NOTE(review): the earlier punctuation list also held empty-string
    entries and the function ended with ``replace('', '')``; both are no-ops
    as written (possibly characters lost in an encoding step) and are left
    out - confirm nothing was meant to be there.
    """
    punct_marks = frozenset([
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-',
        '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
        '`', '{', '|', '}', '~', '...',
    ])
    # Split once instead of re-splitting on every loop iteration.
    source_tokens = space_after_punct(original).split()
    target_words = transliterated.split()

    rebuilt = []
    used = 0  # number of transliterated words consumed so far
    for token in source_tokens:
        if token in punct_marks:
            rebuilt.append(token)
            continue
        if used >= len(target_words):
            # Fewer transliterated words than word tokens: keep the source.
            rebuilt.append(token)
            continue
        word = target_words[used]
        used += 1
        if token[-1] in punct_marks:
            word += token[-1]
        rebuilt.append(word)

    # str.replace returns a new string; the result has to be used.
    return " ".join(rebuilt).replace(' ... ', '...')
def MNF_translate(text, dest_lang):
    """Translate *text* into *dest_lang* with the module-level client.

    Returns the "translatedText" field of the client's response.
    """
    response = translate_client.translate(text, target_language=dest_lang)
    return response['translatedText']
def dial_comparison_transliteration_rom_dev_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Latin-script *text* to Devanagari, one word at a time.

    The language/script arguments are accepted but immediately overridden
    with "hi", "Latin" and "Devanagari". The text is split into sentences
    and each word (bare "." tokens are dropped) is sent to four engines:
    azure_transliteration, indic_trans, google ('en' -> 'hi') and
    indic_transliteration_IAST. ``compare_outputs_transliteration`` picks
    the output to keep for each word.

    Returns the kept words joined by single spaces.
    """
    # Hard-coded route: the caller's values are not used.
    source_lang, source_script, dest_script = "hi", "Latin", "Devanagari"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    # Sentence fragments that carry no words and are skipped outright.
    ignorable = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        print(sentence)
        kept_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Order must match the index keys of sources_name.
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                google(word, 'en', 'hi'),
                indic_transliteration_IAST(word),
            ]
            kept_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(kept_words))
    return " ".join(transliterated_text)
# def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
# sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
# sentences=sentence_tokenize.sentence_split(text, lang='hi')
# priority_list =['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
# transliterated_text=[]
# for sentence in sentences:
# if sentence == "" or sentence == " . . ." or sentence == " . ." or sentence ==" . . ”":
# continue
# print(sentence)
# OUT=[]
# for word in sentence.split():
# if word==".":
# continue
# print(word)
# t0 = indic_trans(word, source_script, dest_script)
# #print(t0)
# t1 = azure_transliteration(word, source_lang, source_script, dest_script)
# #print(t1)
# t2 = libindic(word, dest_script).rstrip()
# #print(t2)
# t3 = sheetal(word).replace('\n','')
# #print(t3)
# t4 = ritwik(word).replace('\n','').rstrip()
# #print(t4)
# outputs=[t0, t1, t2, t3, t4]
# out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
# OUT.append(out)
# transliterated_text.append(" ".join(OUT))
# return " ".join(transliterated_text)
def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Devanagari *text* to Latin script, sentence by sentence.

    Each sentence is stripped of punctuation with ``punct_remover`` and sent
    whole to five engines (indic_trans, azure_transliteration, libindic,
    sheetal, ritwik). The engine outputs are then split on whitespace and
    compared word by word through ``compare_outputs_transliteration``.
    The punctuation of the source sentence is put back with
    ``final_transliterated_sentence``.

    Returns the rebuilt sentences joined by single spaces. An engine output
    with fewer words than the punctuation-free sentence raises IndexError.
    """
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
    priority_list = ['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
    # Sentence fragments that carry no words and are skipped outright.
    ignorable = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in ignorable:
            continue
        print("original sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence after punctuation", temp_sentence)

        # Whole-sentence output of every engine, in sources_name index order.
        engine_outputs = [
            indic_trans(temp_sentence, source_script, dest_script),
            azure_transliteration(temp_sentence, source_lang, source_script, dest_script),
            libindic(temp_sentence, dest_script).rstrip(),
            sheetal(temp_sentence).replace('\n',''),
            ritwik(temp_sentence).replace('\n','').rstrip(),
        ]

        kept_words = []
        for position, word in enumerate(temp_sentence.split()):
            # Word at the same position in each engine's output.
            candidates = [output.split()[position] for output in engine_outputs]
            kept_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )

        trans_sent_wo_punct = " ".join(kept_words)
        print("trans_sent_wo_punct", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_with_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)
def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Arabic-script *text* to Latin script, one word at a time.

    The text is split into sentences and each word (bare "." tokens are
    dropped) is sent to indic_trans, azure_transliteration and
    buck_2_unicode. ``compare_outputs_transliteration`` picks the output to
    keep for each word.

    Returns the kept words joined by single spaces.
    """
    print("hello")
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']
    # Sentence fragments that carry no words and are skipped outright.
    ignorable = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        print(sentence)
        kept_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Order must match the index keys of sources_name.
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                buck_2_unicode(word),
            ]
            kept_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(kept_words))
    return " ".join(transliterated_text)
def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Kannada-script *text* to Latin script, one word at a time.

    The text is split into sentences and each word (bare "." tokens are
    dropped) is sent to om_transliterator, indic_trans, libindic and
    azure_transliteration. ``compare_outputs_transliteration`` picks the
    output to keep for each word.

    Returns the kept words joined by single spaces.
    """
    print("hello")
    sources_name = {'0':'om_transliteration', '1':'indic_trans', '2':'libindic', '3':'Azure'}
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']
    # Sentence fragments that carry no words and are skipped outright.
    ignorable = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        print(sentence)
        kept_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Order must match the index keys of sources_name.
            candidates = [
                om_transliterator(word),
                indic_trans(word, source_script, dest_script),
                # Per-word candidate: pass the word, not the whole text, so
                # it is comparable with the other engines' outputs.
                libindic(word, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
            ]
            kept_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(kept_words))
    return " ".join(transliterated_text)
def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Tamil-script *text* to Latin script, one word at a time.

    The text is split into sentences and each word (bare "." tokens are
    dropped) is sent to azure_transliteration, libindic and indic_trans.
    ``compare_outputs_transliteration`` picks the output to keep for each
    word.

    Returns the kept words joined by single spaces.
    """
    print("hello")
    sources_name = {'0':'Azure', '1':'libindic', '2':'indic_trans'}
    priority_list = ['Azure', 'libindic', 'indic_trans']
    # Sentence fragments that carry no words and are skipped outright.
    ignorable = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        print(sentence)
        kept_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Candidates are listed in the index order of sources_name
            # (0 Azure, 1 libindic, 2 indic_trans) so every output is
            # attributed to the engine that produced it.
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                # Per-word candidate: pass the word, not the whole text.
                libindic(word, dest_script),
                indic_trans(word, source_script, dest_script),
            ]
            kept_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(kept_words))
    return " ".join(transliterated_text)
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Pick the transliteration to keep for *word* and log the comparison.

    ``selection_source_transliteration`` returns two (output, source) style
    results. The first one is printed, both are written as a new row of the
    module-level ``table2``, and the output part of the first is returned.
    """
    #print(outputs)
    first_choice, second_choice = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    print(first_choice)
    add_dial_comparison_doc2_transliteration(
        doc2, table2, word, first_choice, second_choice, sources_name
    )
    return first_choice[0]
def add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name):
    """Append one comparison row to *table2*.

    Column 0 receives *word*. Columns 1 and 2 receive element 0 of
    *O1ANDS1* and *O2ANDS2* respectively, each followed by a
    '(Source : ...)' run built from element 1. *doc2* and *sources_name*
    are accepted but not used.
    """
    cells = table2.add_row().cells
    cells[0].text = word
    for column, choice in enumerate((O1ANDS1, O2ANDS2), start=1):
        cell = cells[column]
        cell.text = choice[0]
        cell.paragraphs[0].add_run(f'(Source : {choice[1]!s})')
# Collect the non-empty dialogues of the second input (scenes1, first five
# scenes) in reading order. The main loop pairs them by position with the
# dialogues it transliterates, so both loops must skip exactly the same lines.
original_dialogues = []
for scene in tqdm(scenes1[:5]):
    for i, line in enumerate(scene):
        # Entry 0 is the slug line and plain strings are action lines;
        # neither carries a dialogue.
        if i == 0 or isinstance(line, str):
            continue
        print("In dialogue")
        [speaker] = line.keys()
        if speaker == 'Transition':
            continue
        # A parenthetical (line[speaker][0] != 'NONE') does not skip the
        # entry: the main loop still transliterates its dialogue, and
        # dropping it here would shift every later original_dialogues[j].
        dialogue = line[speaker][2]
        if dialogue == "":
            continue
        original_dialogues.append(dialogue)
# if dial_dest_script == "Latin" and dial_src_script == "Devanagari":
# trans_text = dial_comparison_transliteration_dev_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script)
# elif dial_dest_script == "Devanagari" and dial_src_script == "Latin":
# trans_text = dial_comparison_transliteration_rom_dev_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script)
# elif dial_dest_script == "Latin" and dial_src_script == "Arabic":
# trans_text = dial_comparison_transliteration_arbic_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
# elif dial_dest_script == "Latin" and dial_src_script == "Kannada":
# trans_text = dial_comparison_transliteration_kann_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
# elif dial_dest_script == "Latin" and dial_src_script == "Tamil":
# trans_text = dial_comparison_transliteration_tamil_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
# if dual_dial_script == "Yes":
# dual_script(doc, line[speaker][2], trans_text, dial_src_lang)
# else:
# addDialogue(doc, trans_text, dial_src_lang)
# Detect the language of the action lines and the language and script of the
# dialogues. Scenes are scanned until one has yielded both kinds of
# detection; within a scene the last action line / dialogue seen wins.
# The names start as None so the prints below cannot hit an unbound name
# when the script has no action line or no dialogue.
non_dial_src_lang = None
dial_src_lang = None
dial_src_script = None
for scene in tqdm(scenes):
    found_action = False
    found_dialogue = False
    for i, line in enumerate(scene):
        if i == 0:
            # Entry 0 is the slug line.
            continue
        if isinstance(line, str):
            found_action = True
            non_dial_src_lang = language_detector(line)
            continue
        [speaker] = line.keys()
        # Transitions and entries with a parenthetical are not used for
        # detection.
        if speaker == 'Transition' or line[speaker][0] != 'NONE':
            continue
        dialogue = line[speaker][2]
        if dialogue == "":
            # Nothing to detect in an empty dialogue.
            continue
        dial_src_lang = language_detector(dialogue)
        dial_src_script = script_det(dialogue)
        # Flag only after a dialogue has really been analysed; flagging
        # skipped entries could end the scan before anything was detected.
        found_dialogue = True
    # NOTE(review): the stop condition is evaluated once per scene - confirm
    # this matches the intended nesting.
    if found_action and found_dialogue:
        break
print("non_dial_src_lang", non_dial_src_lang)
print("dial_src_lang", dial_src_lang)
print("dial_src_script", dial_src_script)
print("dial_dest_script", dial_dest_script)
# Main pass: rebuild the first five scenes in `doc`, transliterating every
# non-empty dialogue from dial_src_script into dial_dest_script.

# Supported (source script, destination script) routes.
transliterators = {
    ("Devanagari", "Latin"): dial_comparison_transliteration_dev_rom_ph1,
    ("Latin", "Devanagari"): dial_comparison_transliteration_rom_dev_ph1,
    ("Arabic", "Latin"): dial_comparison_transliteration_arbic_to_rom_ph1,
    ("Kannada", "Latin"): dial_comparison_transliteration_kann_to_rom_ph1,
    ("Tamil", "Latin"): dial_comparison_transliteration_tamil_to_rom_ph1,
}
transliterate = transliterators.get((dial_src_script, dial_dest_script))
if transliterate is None:
    # Without a route the dialogues are written unchanged instead of reusing
    # a stale trans_text or failing on an unbound one.
    print("no transliteration route from", dial_src_script, "to", dial_dest_script)

j = 0  # position of the next entry of original_dialogues to pair up
for scene in tqdm(scenes[:5]):
    for i, line in enumerate(scene):
        if i == 0:
            addSlugLine(doc, line)
            continue
        if isinstance(line, str):
            addActionLine(doc, line, non_dial_src_lang)
            continue

        print("In dialogue")
        [speaker] = line.keys()
        if speaker == 'Transition':
            addTransition(doc, line[speaker])
            continue
        addSpeaker(doc, speaker)

        parenthetical = line[speaker][0]
        dialogue = line[speaker][2]
        if parenthetical != 'NONE':
            # Parentheticals are copied as they are; translating them is
            # switched off.
            addParenthetical(doc, parenthetical)

        print("dialogue to be transliterated ", dialogue)
        if dialogue == "":
            continue

        if transliterate is None:
            trans_text = dialogue
        else:
            trans_text = transliterate(dialogue, dial_src_lang, dial_src_script, dial_dest_script)

        if dual_dial_script == "Yes":
            if translation_and_transliteration == "Yes" and j < len(original_dialogues):
                # Pair with the dialogue collected from the second input.
                dual_script(doc, original_dialogues[j], trans_text, dial_src_lang)
                j += 1
            else:
                # No collected original available (or not requested): pair
                # with the dialogue that was just transliterated.
                dual_script(doc, dialogue, trans_text, dial_src_lang)
        else:
            addDialogue(doc, trans_text, dial_src_lang)
# Create the output folder first so the save cannot fail on a missing
# directory after all the transliteration work has been done.
os.makedirs(os.path.dirname(doc_file) or ".", exist_ok=True)
doc.save(doc_file)
#doc1a.save("1"+doc_file)
#doc1b.save("2"+doc_file)
# The per-word comparison table always goes to the working directory.
doc2.save("final.docx")