785 lines
34 KiB
Python
785 lines
34 KiB
Python
|
import os
|
||
|
import sys
|
||
|
import docx
|
||
|
import re
|
||
|
# import textract
|
||
|
from tqdm import tqdm
|
||
|
from collections import Counter
|
||
|
import ntpath
|
||
|
from docx.shared import Inches, Cm, Pt
|
||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
|
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
|
||
|
import requests, uuid, json
|
||
|
import nltk.translate.bleu_score as bleu
|
||
|
import nltk.translate.gleu_score as gleu
|
||
|
from rouge_score import rouge_scorer
|
||
|
import numpy as np
|
||
|
from indicnlp.tokenize import sentence_tokenize
|
||
|
import nltk
|
||
|
|
||
|
from MNF.settings import BasePath
|
||
|
basePath = BasePath()
|
||
|
from google.cloud import translate
|
||
|
from google.cloud import translate_v2 as Translate
|
||
|
|
||
|
from .script_detector import script_cat
|
||
|
from .buck_2_unicode import buck_2_unicode
|
||
|
from .translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
|
||
|
from .selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
|
||
|
from .script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker, non_dial_checker
|
||
|
from .script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes
|
||
|
from .translation_resources import ibm_watson, google, aws, azure, yandex #lingvanex
|
||
|
from .transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal,unicode_transliteration_GURMUKHI,indic_transliteration_GURMUKHI,transliteration_LATIN_CYRILLIC,indic_transliteration_TELUGU,unicode_transliteration_GURMUKHI_LATIN,indic_transliteration_GURMUKHI_LATIN,transliteration_CYRILIC_LATIN,ConvertToLatin,readonly
|
||
|
from .detection import language_detector, script_det
|
||
|
# import logging
|
||
|
# from logger import get_module_logger
|
||
|
# log = get_module_logger(__name__)
|
||
|
# log.info('Logger working')
|
||
|
|
||
|
|
||
|
def makeTransliteration_translation(translation_and_transliteration, translated_file, dial_dest_script, dual_dial_script, original_file):
    """Transliterate the dialogues of a screenplay and write them to a .docx.

    This opening part of the function performs the set-up: it points the
    Google client libraries at the project's credential file, creates the
    output document (``doc``) and a landscape comparison document (``doc2``
    with its 4-column ``table2``), and parses both input files into scene
    lists (``scenes`` from ``translated_file``, ``scenes1`` from
    ``original_file``).

    Args:
        translation_and_transliteration: compared against ``"Yes"`` further
            down; when it matches, dual-script output pairs each
            transliterated dialogue with a dialogue taken from
            ``original_file``.
        translated_file: path of the script whose dialogues are
            transliterated. The result is later saved to this same path.
        dial_dest_script: name of the destination script (e.g. ``"Latin"``).
        dual_dial_script: ``"Yes"`` to write source and transliterated
            dialogue together via ``dual_script``.
        original_file: path of the original script, parsed for its dialogues.

    Returns:
        ``translated_file`` (as ``docfile``) once the document is saved.
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/MNF/json_keys/authentication.json"

    # Both clients are constructed here; only ``translate_client`` is read by
    # the nested helpers visible in this function.
    translate_client = Translate.Client()
    client = translate.TranslationServiceClient()
    project_id = 'authentic-bongo-272808'
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Output document; it is saved back onto the translated file's path.
    doc = docx.Document()
    docfile = translated_file
    print(docfile)

    # Comparison document: narrow margins on every section, last section
    # turned landscape by swapping its page dimensions.
    doc2 = docx.Document()
    narrow_margin = Inches(0.2)
    for page_section in doc2.sections:
        page_section.top_margin = page_section.bottom_margin = page_section.left_margin = page_section.right_margin = narrow_margin
    last_section = doc2.sections[-1]
    last_section.page_width, last_section.page_height = last_section.page_height, last_section.page_width

    doc2.add_heading('Final table '+docfile, 0)
    doc2.add_paragraph().add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True

    table2 = doc2.add_table(rows=1,cols=4)
    table2.style = 'TableGrid'
    for header_cell, label in zip(table2.rows[0].cells, ("Input", "Output1", "Output2", "Output3")):
        header_cell.paragraphs[0].add_run(label).bold=True

    def _parse_scenes(path):
        # Run the script_reading pipeline on one file and keep only its scenes.
        refined, total_scenes = getRefined(path)
        _sluglines, without_slug = getSlugAndNonSlug(refined)
        characters = getSpeakers(without_slug)
        parsed_scenes, _actionline, _parentheticals, _speakers, _dialogues = getScenes(refined, total_scenes, characters)
        return parsed_scenes

    # scenes: translated file (the one being transliterated)
    scenes = _parse_scenes(translated_file)
    # scenes1: original file (source of the dual-dialogue originals)
    scenes1 = _parse_scenes(original_file)
|
||
|
|
||
|
|
||
|
def punct_remover(string):
    """Return ``string`` with every punctuation character replaced by a space.

    The punctuation set covers ASCII punctuation used in scripts plus the
    horizontal ellipsis and the Devanagari danda. Each character is
    replaced one-for-one, so the result has the same length as the input and
    runs of punctuation become runs of spaces.

    Args:
        string: text to clean.

    Returns:
        The cleaned text.
    """
    # The backslash is escaped explicitly: the previous literal relied on the
    # invalid escape sequence "\," which newer Python versions warn about.
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~…।'''
    # One C-level pass instead of one str.replace call per punctuation hit.
    return string.translate(str.maketrans(punctuations, " " * len(punctuations)))
|
||
|
|
||
|
def space_after_punct(text):
    """Normalise spacing so punctuation is followed by exactly one space.

    Steps, in order:
      1. a spaced ellipsis ``". . ."`` becomes ``" ... "``;
      2. a space is inserted after each of ``, ! ? ( ) … -``;
      3. any run of two or more whitespace characters collapses to one space.

    Args:
        text: sentence to normalise.

    Returns:
        The normalised sentence (it may end with a single trailing space).
    """
    text = text.replace('. . .', ' ... ')
    # Raw strings: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r'([,!?()…-])', r'\1 ', text)
    return re.sub(r'\s{2,}', ' ', text)
|
||
|
|
||
|
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of ``original`` to ``transliterated``.

    ``transliterated`` is expected to hold the transliterated words of
    ``original`` with punctuation removed. The original sentence is spaced
    with ``space_after_punct`` and walked token by token:

    * a token that is itself punctuation is copied through;
    * any other token consumes the next transliterated word, and if the
      token ends in a punctuation character that character is appended.

    Finally the spacing introduced around ellipses is removed again.

    Args:
        original: the source sentence, punctuation included.
        transliterated: space-separated transliterated words.

    Returns:
        The transliterated sentence with punctuation restored.
    """
    punct_list = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-', '.', '/', ':', ';',
                  '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '…', '...', '।'}

    # Split once instead of re-splitting both strings on every access.
    original_tokens = space_after_punct(original).split()
    transliterated_words = transliterated.split()

    sentence = []
    j = 0  # index of the next unused transliterated word
    for token in original_tokens:
        if token in punct_list:
            sentence.append(token)
            continue
        if j >= len(transliterated_words):
            # The transliteration has fewer words than the original (the two
            # tokenisations can drift apart, e.g. on ".." or "don't"); keep
            # the original token rather than raising IndexError.
            sentence.append(token)
            continue
        word = transliterated_words[j]
        j += 1
        if token[-1] in punct_list:
            word += token[-1]
        sentence.append(word)

    transliterated_sentence = " ".join(sentence)
    # str.replace returns a new string; the results were previously discarded,
    # so these two clean-ups never took effect.
    transliterated_sentence = transliterated_sentence.replace(' ... ', '...')
    transliterated_sentence = transliterated_sentence.replace('… ', '…')
    return transliterated_sentence
|
||
|
|
||
|
def MNF_translate(text, dest_lang):
    """Translate ``text`` into ``dest_lang`` with the enclosing function's
    ``translate_client`` and return the ``'translatedText'`` field of the
    response."""
    return translate_client.translate(text, target_language = dest_lang)['translatedText']
|
||
|
|
||
|
def google_length_checker(t, temp_sentence, t0):
    """Pad ``t`` so it has as many words as ``temp_sentence``.

    The caller indexes the returned string word by word alongside
    ``temp_sentence``, so a shorter ``t`` would raise IndexError there. Any
    missing trailing words are borrowed from the end of ``t0``.

    Previously only shortfalls of one or two words were padded (larger ones
    were returned unchanged) and a ``t0`` with too few words raised
    IndexError; both cases are now handled.

    Args:
        t: candidate output that may have lost words.
        temp_sentence: the punctuation-free source sentence.
        t0: another backend's output for the same sentence, used as filler.

    Returns:
        ``t`` unchanged when it is long enough (or when ``t0`` has nothing
        to offer), otherwise ``t`` followed by the borrowed words.
    """
    missing = len(temp_sentence.split()) - len(t.split())
    if missing <= 0:
        return t
    # Slicing never raises: if t0 is shorter than the shortfall we simply
    # borrow every word it has.
    filler = t0.split()[-missing:]
    if not filler:
        return t
    return t + " " + " ".join(filler)
|
||
|
|
||
|
def Halant_remover(T3):
    """Drop one trailing halant (virama, "्") from ``T3`` if present.

    Args:
        T3: a transliterated word; may be empty.

    Returns:
        ``T3`` without its final halant, or ``T3`` unchanged. An empty
        string is returned as is (indexing ``T3[-1]`` used to raise
        IndexError for it).
    """
    return T3[:-1] if T3.endswith("्") else T3
|
||
|
|
||
|
def dial_comparison_transliteration_rom_dev_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` from Latin to Devanagari, one word at a time.

    The language and both scripts are fixed to ``"hi"``, ``"Latin"`` and
    ``"Devanagari"``; the values passed by the caller are ignored. Every
    word is sent to four backends (Azure, indic_trans, ``google`` en->hi and
    IAST) and ``compare_outputs_transliteration`` picks the output to keep,
    logging the comparison as a side effect.

    Returns:
        The chosen words joined with single spaces.
    """
    source_lang, source_script, dest_script = "hi", "Latin", "Devanagari"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    # Sentences that consist only of ellipsis residue are skipped.
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                google(word, 'en', 'hi'),
                indic_transliteration_IAST(word),
            ]
            chosen_words.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
|
||
|
def dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, source_lang, source_script, dest_script):
    """Sentence-wise variant of the Latin -> Devanagari transliteration.

    Each sentence is stripped of punctuation and sent whole to four backends
    (Azure, indic_trans, ``google`` en->hi padded by
    ``google_length_checker``, and IAST). The backend outputs are then
    compared word by word through ``compare_outputs_transliteration`` and the
    original punctuation is restored with ``final_transliterated_sentence``.

    ``source_lang`` is forced to ``"hi"``; the scripts come from the caller.
    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    source_lang = "hi"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue

        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)

        azure_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(azure_out)
        indic_out = indic_trans(temp_sentence, source_script, dest_script)
        print(indic_out)
        google_raw = google(temp_sentence, 'en', 'hi')
        # Pad the google output with Azure's trailing words if it came back short.
        google_out = google_length_checker(google_raw, temp_sentence, azure_out)
        print("T22 transliteration",google_out)
        iast_out = indic_transliteration_IAST(temp_sentence)
        print(iast_out)

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [
                azure_out.split()[i],
                indic_out.split()[i],
                google_out.split()[i],
                Halant_remover(iast_out.split()[i]),
            ]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        trans_sent_wo_punct = " ".join(picked)
        print("trans_sent_wo_punct", trans_sent_wo_punct)

        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        finished_sentences.append(transliterated_sentence)

    return " ".join(finished_sentences)
|
||
|
|
||
|
# def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
|
||
|
# sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
|
||
|
# sentences=sentence_tokenize.sentence_split(text, lang='hi')
|
||
|
# priority_list =['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
|
||
|
|
||
|
# transliterated_text=[]
|
||
|
# for sentence in sentences:
|
||
|
# if sentence == "" or sentence == " . . ." or sentence == " . ." or sentence ==" . . ”":
|
||
|
# continue
|
||
|
# print("original sentence", sentence)
|
||
|
# temp_sentence = punct_remover(sentence)
|
||
|
# print("sentence after punctuation", temp_sentence)
|
||
|
|
||
|
# t0 = indic_trans(temp_sentence, source_script, dest_script)
|
||
|
# #print(t0)
|
||
|
# t1 = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
|
||
|
# #print(t1)
|
||
|
# t2 = libindic(temp_sentence, dest_script).rstrip()
|
||
|
# #print(t2)
|
||
|
# t3 = sheetal(temp_sentence).replace('\n','')
|
||
|
# #print(t3)
|
||
|
# t4 = ritwik(temp_sentence).replace('\n','').rstrip()
|
||
|
# #print(t4)
|
||
|
|
||
|
|
||
|
# Out= []
|
||
|
# outputs = []
|
||
|
# for i in range(len(temp_sentence.split())):
|
||
|
# word = temp_sentence.split()[i]
|
||
|
|
||
|
# T0 = t0.split()[i]
|
||
|
# T1 = t1.split()[i]
|
||
|
# T2 = t2.split()[i]
|
||
|
# T3 = t3.split()[i]
|
||
|
# T4 = t4.split()[i]
|
||
|
|
||
|
# outputs=[T0, T1, T2, T3, T4]
|
||
|
# out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
|
||
|
# Out.append(out)
|
||
|
# trans_sent_wo_punct = " ".join(Out)
|
||
|
# print("trans_sent_wo_punct", trans_sent_wo_punct)
|
||
|
|
||
|
# transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
|
||
|
# print("trans_sent_with_punct", transliterated_sentence)
|
||
|
# transliterated_text.append(transliterated_sentence)
|
||
|
|
||
|
# return " ".join(transliterated_text)
|
||
|
def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using four backends.

    Named for Devanagari -> Roman; the language and scripts are whatever the
    caller passes. Sentences are split with ``lang='hi'``. For every word
    the outputs of indic_trans, Azure, libindic (right-stripped) and sheetal
    (newlines removed) are handed to ``compare_outputs_transliteration``,
    which returns the one to keep.

    Returns:
        The chosen words joined with single spaces.
    """
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal'}
    priority_list = ['indic_trans', 'Azure', 'sheetal', 'libindic']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                libindic(word, dest_script).rstrip(),
                sheetal(word).replace('\n',''),
            ]
            chosen_words.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
|
||
|
def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three backends.

    Named for Arabic -> Roman; language and scripts are taken from the
    caller. Each word is run through indic_trans, Azure and
    ``buck_2_unicode`` and ``compare_outputs_transliteration`` selects the
    output to keep.

    Returns:
        The chosen words joined with single spaces.
    """
    print("hello")
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                buck_2_unicode(word),
            ]
            chosen_words.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using four backends.

    Named for Kannada -> Roman; language and scripts are taken from the
    caller. Each word is run through om_transliterator, indic_trans,
    libindic and Azure, and ``compare_outputs_transliteration`` selects the
    output to keep.

    Returns:
        The chosen words joined with single spaces.
    """
    print("hello")
    sources_name = {'0':'om_transliteration', '1':'indic_trans', '2':'libindic', '3':'Azure'}
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            candidates = [
                om_transliterator(word),
                indic_trans(word, source_script, dest_script),
                libindic(word, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
            ]
            chosen_words.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three backends.

    Named for Tamil -> Roman; language and scripts are taken from the
    caller. Each word is run through Azure, libindic and indic_trans, and
    ``compare_outputs_transliteration`` selects the output to keep.

    The candidate list is now built in the same order as ``sources_name``
    (Azure, libindic, indic_trans). It used to be [Azure, indic_trans,
    libindic], so the libindic and indic_trans outputs were attributed to
    each other.
    NOTE(review): this assumes ``selection_source_transliteration`` pairs
    ``outputs[i]`` with ``sources_name[str(i)]`` - confirm against that
    helper.

    Returns:
        The chosen words joined with single spaces.
    """
    print("hello")
    sources_name = {'0':'Azure', '1':'libindic', '2':'indic_trans', }
    priority_list = ['Azure', 'libindic', 'indic_trans']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            azure_out = azure_transliteration(word, source_lang, source_script, dest_script)
            libindic_out = libindic(word, dest_script)
            indic_trans_out = indic_trans(word, source_script, dest_script)
            # Positions must line up with the keys of sources_name.
            outputs = [azure_out, libindic_out, indic_trans_out]
            chosen_words.append(compare_outputs_transliteration(word, outputs, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
def dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate ``text`` word by word using three backends.

    Shared by several source scripts (the dispatch below uses it for
    Bengali, Telugu and Malayalam, among others); language and scripts are
    taken from the caller. Each word is run through Azure, indic_trans and
    libindic, and ``compare_outputs_transliteration`` selects the output to
    keep.

    Returns:
        The chosen words joined with single spaces.
    """
    print("hello gujarati to latin")
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'libindic'}
    priority_list = ['Azure', 'indic_trans', 'libindic']
    skipped_sentences = ("", " . . .", " . .", " . . ”")

    sentence_results = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_sentences:
            continue
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                libindic(word, dest_script),
            ]
            chosen_words.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))
        sentence_results.append(" ".join(chosen_words))
    return " ".join(sentence_results)
|
||
|
|
||
|
def dial_comparison_transliteration_latin_gurmukhi(text,source_lang, source_script, dest_script):
    """Sentence-wise transliteration, named for Latin -> Gurmukhi.

    ``source_lang`` is forced to ``"pa"``. Each sentence is stripped of
    punctuation and sent whole to Azure, ``indic_transliteration_GURMUKHI``
    and ``unicode_transliteration_GURMUKHI``; the outputs are compared word
    by word through ``compare_outputs_transliteration`` and punctuation is
    restored with ``final_transliterated_sentence``.

    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    source_lang = "pa"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'indic_trans_IAST']
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        temp_sentence = punct_remover(sentence)

        # One whole-sentence output per backend, in sources_name order.
        backend_sentences = [
            azure_transliteration(temp_sentence, source_lang, source_script, dest_script),
            indic_transliteration_GURMUKHI(temp_sentence),
            unicode_transliteration_GURMUKHI(temp_sentence),
        ]

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [backend_sentence.split()[i] for backend_sentence in backend_sentences]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        finished_sentences.append(final_transliterated_sentence(sentence, " ".join(picked)))

    return " ".join(finished_sentences)
|
||
|
|
||
|
def dial_comparison_transliteration_latin_cyrillic(text,source_lang, source_script, dest_script):
    """Sentence-wise transliteration, named for Latin -> Cyrillic.

    ``source_lang`` is forced to ``"bg"``. Each sentence is stripped of
    punctuation and sent whole to Azure and
    ``transliteration_LATIN_CYRILLIC``; the outputs are compared word by word
    through ``compare_outputs_transliteration`` and punctuation is restored
    with ``final_transliterated_sentence``.

    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    source_lang = "bg"
    sources_name = {'0':'Azure', '1':'indic_trans'}
    priority_list = ['Azure', 'indic_trans']
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in ignorable:
            continue
        temp_sentence = punct_remover(sentence)

        # One whole-sentence output per backend, in sources_name order.
        backend_sentences = [
            azure_transliteration(temp_sentence, source_lang, source_script, dest_script),
            transliteration_LATIN_CYRILLIC(temp_sentence),
        ]

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [backend_sentence.split()[i] for backend_sentence in backend_sentences]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        finished_sentences.append(final_transliterated_sentence(sentence, " ".join(picked)))

    return " ".join(finished_sentences)
|
||
|
|
||
|
def dial_comparison_transliteration_latin_telugu_sentence_wise(text, source_lang, source_script, dest_script):
    """Sentence-wise transliteration, named for Latin -> Telugu.

    ``source_lang`` is forced to ``"te"``; sentences are split with
    ``lang='hi'``. Each sentence is stripped of punctuation and sent whole
    to ``indic_transliteration_TELUGU`` and Azure; the outputs are compared
    word by word through ``compare_outputs_transliteration`` and punctuation
    is restored with ``final_transliterated_sentence``.

    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    source_lang = "te"
    sources_name = {'0':'indic_trans', '1':'Azure'}
    priority_list = ['indic_trans', 'Azure',]
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in ignorable:
            continue

        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)

        indic_out = indic_transliteration_TELUGU(temp_sentence)
        print(indic_out)
        azure_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(azure_out)

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [indic_out.split()[i], azure_out.split()[i]]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        trans_sent_wo_punct = " ".join(picked)
        print("trans_sent_wo_punct", trans_sent_wo_punct)

        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        finished_sentences.append(transliterated_sentence)

    return " ".join(finished_sentences)
|
||
|
|
||
|
|
||
|
def dial_comparison_transliteration_gurmukhi_latin_sentence_wise(text, source_lang, source_script, dest_script):
    """Sentence-wise transliteration, named for Gurmukhi -> Latin.

    ``source_lang`` is forced to ``"pa"``; sentences are split with
    ``lang='hi'``. Each sentence is stripped of punctuation and sent whole
    to ``indic_transliteration_GURMUKHI_LATIN``, Azure and
    ``unicode_transliteration_GURMUKHI_LATIN`` (right-stripped); the outputs
    are compared word by word through ``compare_outputs_transliteration``
    and punctuation is restored with ``final_transliterated_sentence``.

    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    source_lang = "pa"
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'unicode'}
    priority_list = ['indic_trans', 'Azure', 'unicode']
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in ignorable:
            continue

        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)

        indic_out = indic_transliteration_GURMUKHI_LATIN(temp_sentence)
        print(indic_out)
        azure_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print(azure_out)
        unicode_out = unicode_transliteration_GURMUKHI_LATIN(temp_sentence).rstrip()
        print(unicode_out)

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [indic_out.split()[i], azure_out.split()[i], unicode_out.split()[i]]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        trans_sent_wo_punct = " ".join(picked)
        print("trans_sent_wo_punct", trans_sent_wo_punct)

        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        finished_sentences.append(transliterated_sentence)

    return " ".join(finished_sentences)
|
||
|
|
||
|
def dial_comparison_transliteration_cyrilic_latin_sentence_wise(text, source_lang, source_script, dest_script):
    """Sentence-wise transliteration, named for Cyrillic -> Latin.

    ``source_lang`` is forced to ``"bg"``; sentences are split with
    ``lang='hi'``. Each sentence is stripped of punctuation and sent whole
    to Azure, ``transliteration_CYRILIC_LATIN`` and ``ConvertToLatin``; the
    outputs are compared word by word through
    ``compare_outputs_transliteration`` and punctuation is restored with
    ``final_transliterated_sentence``.

    Word alignment is positional, so a backend that returns fewer words than
    the source sentence raises IndexError.

    Returns:
        The transliterated sentences joined with single spaces.
    """
    print("cyrillic to latin")
    source_lang = "bg"
    # NOTE(review): index '0' is labelled 'indic_trans' but the first
    # candidate below is the Azure output (and '1' is labelled 'Azure' while
    # holding transliteration_CYRILIC_LATIN's output). Confirm which order is
    # intended before relying on the source attribution.
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'unicode'}
    priority_list = ['indic_trans', 'Azure', 'unicode']
    ignorable = ("", " . . .", " . .", " . . ”")

    finished_sentences = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in ignorable:
            continue

        print("original_sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence_without_punctuation", temp_sentence)

        first_out = azure_transliteration(temp_sentence, source_lang, source_script, dest_script)
        print("t0",first_out)
        second_out = transliteration_CYRILIC_LATIN(temp_sentence)
        print("t1",second_out)
        third_out = ConvertToLatin(temp_sentence)
        print("t2",third_out)

        picked = []
        for i, word in enumerate(temp_sentence.split()):
            candidates = [first_out.split()[i], second_out.split()[i], third_out.split()[i]]
            picked.append(compare_outputs_transliteration(word, candidates, sources_name, priority_list))

        trans_sent_wo_punct = " ".join(picked)
        print("trans_sent_wo_punct", trans_sent_wo_punct)

        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_w_punct", transliterated_sentence)
        finished_sentences.append(transliterated_sentence)

    return " ".join(finished_sentences)
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Pick the transliteration to keep for ``word`` and log the comparison.

    ``selection_source_transliteration`` returns two (output, source) pairs;
    both are written as a new row of the comparison table (``doc2`` /
    ``table2`` from the enclosing function).

    Returns:
        The output string of the first pair.
    """
    first_choice, second_choice = selection_source_transliteration(sources_name, outputs, priority_list)
    print(first_choice)
    add_dial_comparison_doc2_transliteration(doc2, table2, word, first_choice, second_choice, sources_name)
    return first_choice[0]
|
||
|
|
||
|
def add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name):
    """Append one comparison row to ``table2``.

    Column 0 receives the input ``word``; columns 1 and 2 receive the output
    text of ``O1ANDS1`` and ``O2ANDS2`` (element 0) followed by a
    ``(Source : ...)`` run naming their source (element 1). ``doc2`` and
    ``sources_name`` are accepted but not used here.
    """
    cells = table2.add_row().cells
    cells[0].text= word
    for column, output_and_source in enumerate((O1ANDS1, O2ANDS2), start=1):
        cell = cells[column]
        cell.text= output_and_source[0]
        cell.paragraphs[0].add_run('(Source : '+str(output_and_source[1])+')')
|
||
|
|
||
|
|
||
|
# Collect every non-empty dialogue of the original file, in reading order,
# for the dual-dialogue output written further down.
original_dialogues = []
for scene in tqdm(scenes1[:]):
    for position, entry in enumerate(scene):
        # Entry 0 of a scene is skipped, and plain strings are not
        # dialogue entries.
        if position == 0 or type(entry) is str:
            continue

        print("In dialogue")
        [speaker] = entry.keys()
        if speaker == 'Transition':
            continue

        spoken_text = entry[speaker][2]
        if spoken_text == "":
            continue
        print("dialogue", spoken_text)
        original_dialogues.append(spoken_text)

print("length of dialogues", len(original_dialogues))
print(original_dialogues)
|
||
|
# Detect the language of the non-dialogue lines and the language and script
# of the dialogues. Every scene is scanned; inside a scene the scan stops as
# soon as both a plain-string line and a dialogue entry have been seen, so
# the values left behind come from the last scene that set them.
# NOTE(review): the three detected names stay unbound (NameError at the
# prints below) if no scene ever sets them - confirm inputs always contain
# both kinds of line.
for scene in tqdm(scenes):
    saw_non_dialogue = False
    saw_dialogue = False
    for position, entry in enumerate(scene):
        if position == 0:
            continue
        if type(entry) is str:
            saw_non_dialogue = True
            non_dial_src_lang = language_detector(entry)
        else:
            # Marked as seen even when the entry is skipped just below.
            saw_dialogue = True
            [speaker] = entry.keys()
            if speaker == 'Transition':
                continue
            if entry[speaker][0] != 'NONE':
                continue

            dial_src_lang = language_detector(entry[speaker][2])
            dial_src_script = script_det(entry[speaker][2])

        if saw_non_dialogue and saw_dialogue:
            break

print("non_dial_src_lang", non_dial_src_lang)
print("dial_src_lang", dial_src_lang)
print("dial_src_script", dial_src_script)
|
||
|
|
||
|
#main for loop
|
||
|
j = 0
|
||
|
for scene in tqdm(scenes[:]):
|
||
|
for i,line in enumerate(scene):
|
||
|
if i == 0:
|
||
|
addSlugLine(doc,line)
|
||
|
continue
|
||
|
if type(line)==type(""):
|
||
|
addActionLine(doc, line, non_dial_src_lang)
|
||
|
|
||
|
|
||
|
else:
|
||
|
print("In dialogue")
|
||
|
[speaker] = line.keys()
|
||
|
if speaker == 'Transition':
|
||
|
addTransition(doc,line[speaker])
|
||
|
continue
|
||
|
addSpeaker(doc,speaker)
|
||
|
if line[speaker][0] != 'NONE': # In parenthitical part
|
||
|
non_dial_translate = "no"
|
||
|
if non_dial_translate == "yes":
|
||
|
out = MNF_translate(line[speaker][0], non_dial_dest_lang)
|
||
|
addParenthetical(doc,out)
|
||
|
else:
|
||
|
addParenthetical(doc,line[speaker][0])
|
||
|
|
||
|
print("dialogue to be transliterated ", line[speaker][2])
|
||
|
if line[speaker][2] == "":
|
||
|
continue
|
||
|
|
||
|
if dial_dest_script == "Latin" and dial_src_script == "Devanagari":
|
||
|
trans_text = dial_comparison_transliteration_dev_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script)
|
||
|
elif dial_dest_script == "Devanagari" and dial_src_script == "Latin":
|
||
|
trans_text = dial_comparison_transliteration_rom_dev_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script)
|
||
|
#trans_text=dial_comparison_transliteration_rom_dev_ph1_sentence_wise(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script)
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Arabic":
|
||
|
trans_text = dial_comparison_transliteration_arbic_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Kannada":
|
||
|
trans_text = dial_comparison_transliteration_kann_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Tamil":
|
||
|
trans_text = dial_comparison_transliteration_tamil_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Bengali":
|
||
|
trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Telugu":
|
||
|
trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Malayalam":
|
||
|
trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Gurmukhi" and dial_src_script == "Latin":
|
||
|
trans_text = dial_comparison_transliteration_latin_gurmukhi(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "" and dial_src_script == "Cyrillic":
|
||
|
trans_text = dial_comparison_transliteration_latin_cyrillic(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Telugu" and dial_src_script == "Latin":
|
||
|
trans_text = dial_comparison_transliteration_latin_telugu_sentence_wise(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Gurmukhi":
|
||
|
trans_text = dial_comparison_transliteration_gurmukhi_latin_sentence_wise(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dial_dest_script == "Latin" and dial_src_script == "Cyrillic":
|
||
|
trans_text = dial_comparison_transliteration_cyrilic_latin_sentence_wise(line[speaker][2], dial_src_lang, dial_src_script, dial_dest_script )
|
||
|
elif dest_script == "Latin" and src_script == "Gujarati":
|
||
|
print("Gujarti to latin hoga")
|
||
|
trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(line[speaker][2], src_lang, src_script,dest_script )
|
||
|
elif dest_script == "Latin" and src_script == "Oriya":
|
||
|
trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(line[speaker][2], src_lang, src_script,dest_script )
|
||
|
|
||
|
if dual_dial_script == "Yes":
|
||
|
if translation_and_transliteration == "Yes":
|
||
|
dual_script(doc, original_dialogues[j], trans_text, dial_src_lang)
|
||
|
j=j+1
|
||
|
else:
|
||
|
dual_script(doc, line[speaker][2], trans_text, dial_src_lang)
|
||
|
|
||
|
else:
|
||
|
addDialogue(doc, trans_text, dial_src_lang)
|
||
|
|
||
|
doc.save(docfile)
|
||
|
# file_tr = rf"{basePath}/media/scripts/Final_Table" +ss str(forFinal.split('.')[0])+"_trans_to_"+str(dial_dest_lang)+'_'+"final.docx"
|
||
|
# doc2.save(file_tr)
|
||
|
print('done file is saved')
|
||
|
return docfile
|