Conversion_Kitchen_Code/kitchen_counter/conversion/attachments/final_transliteration.py

import os
import sys
import docx
import re
from tqdm import tqdm
from collections import Counter
import ntpath
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
import requests, uuid, json
import  nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
from rouge_score import rouge_scorer
import numpy as np
from indicnlp.tokenize import sentence_tokenize
import nltk
# Probe for the NLTK 'punkt' and 'wordnet' resources. Nothing is downloaded
# here (the download calls were disabled); a missing resource is only
# tolerated or reported.
print("time2222")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Missing tokenizer data is ignored on purpose. Only LookupError (what
    # nltk.data.find raises for an absent resource) is swallowed, so that
    # Ctrl-C and genuine failures are no longer hidden.
    pass

try:
    nltk.data.find('wordnet')
except LookupError:
    print("error in finding wordnet11111")

# import logging
# from logger import get_module_logger
# log = get_module_logger(__name__)
# log.info('Logger working')


# Google Cloud Translation setup.
# The credentials path is hard-coded and relative to the current working
# directory; it has to be exported BEFORE the clients below are constructed,
# which is why these imports sit after the assignment instead of at the top.
# NOTE(review): consider supplying the key file / project id via configuration.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector() and MNF_translate() in this file.
translate_client = Translate.Client()
# v3 service client and its resource path; not referenced again in the
# visible part of this file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"


from script_detector import script_cat
from buck_2_unicode import buck_2_unicode
from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
from script_writing import addSlugLine, addActionLine, addSpeaker, addParenthetical, addDialogue, dual_script, addTransition, dial_checker,  non_dial_checker
from script_reading import breaksen, getRefined, getSlugAndNonSlug, getSpeakers, getScenes
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from transliteration_resources import azure_transliteration, indic_trans, om_transliterator, libindic, indic_transliteration_IAST, indic_transliteration_ITRANS, sheetal, ritwik


# Positional command-line arguments; all five are required (a missing one
# raises IndexError right here).
filename1 = sys.argv[1]  # translated script file coming from UI-1 (translation); this is the file that gets transliterated
dial_dest_script = sys.argv[2]  # target script for dialogues; compared against "Latin" / "Devanagari" further down
dual_dial_script = sys.argv[3]  # "Yes"/"No": when "Yes", dialogues are written through dual_script() instead of addDialogue()


translation_and_transliteration = sys.argv[4]  # "Yes"/"No": when "Yes", dual-script output pairs the transliteration with the dialogue taken from filename2
filename2 = sys.argv[5]  # original (untranslated) script file; source of original_dialogues


def _new_landscape_table_doc(heading, header_labels):
    """Create a report document holding one header-only comparison table.

    Every section gets 0.2 inch margins, the last section has its page width
    and height swapped, then a title, a bold line listing the translation
    resources and a 'TableGrid' table with one bold header cell per label are
    added.

    Args:
        heading: Title text placed at the top of the document.
        header_labels: Column captions; the table gets one column per label.

    Returns:
        A ``(document, table)`` tuple.
    """
    document = docx.Document()
    for section in document.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)

    # Swap the page dimensions of the last section (wide page for the table).
    last_section = document.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height,
        last_section.page_width,
    )

    document.add_heading(heading, 0)
    document.add_paragraph().add_run(
        'Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex'
    ).bold = True

    table = document.add_table(rows=1, cols=len(header_labels))
    table.style = 'TableGrid'
    for cell, label in zip(table.rows[0].cells, header_labels):
        cell.paragraphs[0].add_run(label).bold = True
    return document, table


# Output script document and its destination path.
doc = docx.Document()
doc_file = "translated/" + "trans" + ntpath.basename(filename1)
print(doc_file)

# Comparison tables (part a / part b); their save calls are currently
# commented out at the end of the file.
doc1a, table1a = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    ("Input", "Google", "MNF Diff Score Method", "Bleu Diff Score Method"),
)
doc1b, table1b = _new_landscape_table_doc(
    'Dialogue Comparision Table of ' + doc_file,
    ("Gleu Diff Score Method", "Meteor Diff Score Method",
     "Rougen Diff Score Method", "Rougel Diff Score Method"),
)

# Per-word transliteration report; rows are appended by
# add_dial_comparison_doc2_transliteration() and it is saved as final.docx.
doc2, table2 = _new_landscape_table_doc(
    'Final table ' + doc_file,
    ("Input", "Output1", "Output2", "Output3"),
)


# Parse the translated script (filename1). `scenes` is what the detection
# loop and the main transliteration loop iterate over further down.
refined,total_scenes = getRefined(filename1)
print(refined)
sluglines,without_slug = getSlugAndNonSlug(refined)
print(sluglines)
characters = getSpeakers(without_slug)
#print(characters)
scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
#print(scenes)

# Parse the original script (filename2) with the same pipeline.
# NOTE: this rebinds refined, total_scenes, sluglines, without_slug,
# characters, actionline, parenthetical_lis, speakers and dialogues to the
# ORIGINAL file's values; only `scenes` (translated) and `scenes1` (original)
# keep the two parses apart.
refined,total_scenes = getRefined(filename2)
sluglines,without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes1,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)


def language_detector(text):
    """Return the source-language code Google detects for *text*.

    The text is sent through the module-level ``translate_client`` with Hindi
    ('hi') as the target language; only the ``detectedSourceLanguage`` field
    of the response is used, the translation itself is discarded.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]

def script_det(text):
    """Return the script reported by ``script_cat`` for the first non-punctuation character.

    Only the listed ASCII punctuation is skipped; whitespace and digits count
    as ordinary characters. When *text* is empty or consists solely of that
    punctuation, the empty string is handed to ``script_cat``.
    """
    # Raw literal: the value is unchanged, the backslash is simply no longer
    # written as an invalid escape sequence.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]

def punct_remover(string):
    """Return *string* with each listed punctuation character replaced by a space.

    The replacement is one-for-one, so the length is preserved and adjacent
    punctuation produces runs of spaces. Besides ASCII punctuation the set
    covers the ellipsis character and the Devanagari danda.
    """
    # Raw literal: same characters as before, backslash included.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~…।'''
    to_space = str.maketrans(dict.fromkeys(punctuations, " "))
    return string.translate(to_space)

def space_after_punct(text):
    """Normalise spacing around punctuation so words and marks tokenise apart.

    Steps, in order: a spaced ellipsis ('. . .') becomes ' ... '; a space is
    inserted after each of , ! ? ( ) … and -; every run of two or more
    whitespace characters collapses to a single space. An unspaced '...' is
    left as it is.
    """
    after_mark = re.compile('([,!?()…-])')
    whitespace_run = re.compile(r'\s{2,}')

    text = text.replace('. . .', ' ... ')
    return whitespace_run.sub(' ', after_mark.sub(r'\1 ', text))

def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of *original* to the words of *transliterated*.

    *original* is first passed through ``space_after_punct`` and split on
    whitespace. Tokens that are themselves punctuation are copied through;
    every other token consumes the next transliterated word, and if the token
    ends in a punctuation character that character is appended to the word.

    Args:
        original: Sentence with punctuation, in the source script.
        transliterated: Space-separated transliteration of the words only.

    Returns:
        The transliterated sentence with punctuation restored and the spacing
        around ellipses tightened again.

    Raises:
        IndexError: if *transliterated* has fewer words than *original* has
            word tokens.
    """
    punct_tokens = {
        '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', ' ', '-',
        '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_',
        '`', '{', '|', '}', '~', '…', '...', '।',
    }
    # Split once instead of re-splitting both strings on every iteration.
    source_tokens = space_after_punct(original).split()
    target_words = transliterated.split()

    rebuilt = []
    next_word = 0
    for token in source_tokens:
        if token in punct_tokens:
            rebuilt.append(token)
            continue
        word = target_words[next_word]
        next_word += 1
        if token[-1] in punct_tokens:
            word += token[-1]
        rebuilt.append(word)

    sentence = " ".join(rebuilt)
    # str.replace returns a new string; the results used to be discarded, so
    # the ellipsis spacing was never actually undone.
    sentence = sentence.replace(' ... ', '...')
    sentence = sentence.replace('… ', '…')
    return sentence

def MNF_translate(text, dest_lang):
    """Translate *text* into *dest_lang* with the module-level Google client.

    Returns the ``translatedText`` field of the response.
    """
    response = translate_client.translate(text, target_language=dest_lang)
    return response['translatedText']

def dial_comparison_transliteration_rom_dev_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Roman-script text to Devanagari, one word at a time.

    For each word four candidates are gathered (Azure, indic_trans, Google
    en->hi, indic_transliteration IAST) and ``compare_outputs_transliteration``
    picks one.

    NOTE: the ``source_lang``, ``source_script`` and ``dest_script`` arguments
    are ignored; they are overwritten with "hi", "Latin" and "Devanagari".

    Returns:
        The chosen words joined by single spaces. Stand-alone "." tokens are
        skipped.
    """
    source_lang, source_script, dest_script = "hi", "Latin", "Devanagari"
    sources_name = {'0':'Azure', '1':'indic_trans', '2':'google', '3':'indic_trans_IAST'}
    priority_list = ['Azure', 'indic_trans', 'google', 'indic_trans_IAST']
    # Sentence fragments that carry no words and are skipped outright.
    skipped_fragments = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_fragments:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Candidate order must match the indices used in sources_name.
            candidates = [
                azure_transliteration(word, source_lang, source_script, dest_script),
                indic_trans(word, source_script, dest_script),
                google(word, 'en', 'hi'),
                indic_transliteration_IAST(word),
            ]
            chosen_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))
    return " ".join(transliterated_text)


# def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
#   sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
#   sentences=sentence_tokenize.sentence_split(text, lang='hi')
#   priority_list =['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']

#   transliterated_text=[]
#   for sentence in sentences:
#       if sentence == "" or sentence == " . . ." or sentence == " . ." or sentence ==" . . ”":
#         continue
#       print(sentence)
#       OUT=[]
#       for word in sentence.split():
#         if word==".":
#           continue
#         print(word)
#         t0 = indic_trans(word, source_script, dest_script)
#         #print(t0)
#         t1 = azure_transliteration(word, source_lang,  source_script, dest_script)
#         #print(t1)
#         t2 = libindic(word, dest_script).rstrip()
#         #print(t2)
#         t3 = sheetal(word).replace('\n','')
#         #print(t3)
#         t4 = ritwik(word).replace('\n','').rstrip()
#         #print(t4)
#         outputs=[t0, t1, t2, t3, t4]
#         out = compare_outputs_transliteration(word, outputs, sources_name, priority_list)
#         OUT.append(out)
#       transliterated_text.append(" ".join(OUT))
#   return " ".join(transliterated_text)

def dial_comparison_transliteration_dev_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Devanagari text to Roman script, sentence by sentence.

    Each sentence is stripped of punctuation and sent whole to five sources
    (indic_trans, Azure, libindic, sheetal, ritwik). The results are aligned
    word-by-word by position, ``compare_outputs_transliteration`` picks one
    candidate per word, and ``final_transliterated_sentence`` puts the
    original punctuation back.

    Returns:
        The transliterated sentences joined by single spaces.

    Raises:
        IndexError: if a source returns fewer whitespace-separated tokens
            than the punctuation-free sentence has words.
    """
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'libindic', '3':'sheetal', '4':'ritwik'}
    priority_list = ['indic_trans', 'Azure', 'ritwik', 'sheetal', 'libindic']
    skipped_fragments = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='hi'):
        if sentence in skipped_fragments:
            continue
        print("original sentence", sentence)
        temp_sentence = punct_remover(sentence)
        print("sentence after punctuation", temp_sentence)

        words = temp_sentence.split()
        chosen_words = []
        if words:
            # One token list per source, in the index order of sources_name.
            # Newlines are removed before splitting, exactly as before.
            tokens_by_source = [
                indic_trans(temp_sentence, source_script, dest_script).split(),
                azure_transliteration(temp_sentence, source_lang, source_script, dest_script).split(),
                libindic(temp_sentence, dest_script).rstrip().split(),
                sheetal(temp_sentence).replace('\n', '').split(),
                ritwik(temp_sentence).replace('\n', '').rstrip().split(),
            ]
            for position, word in enumerate(words):
                outputs = [tokens[position] for tokens in tokens_by_source]
                chosen_words.append(
                    compare_outputs_transliteration(word, outputs, sources_name, priority_list)
                )

        # Built after the loop so that a punctuation-only sentence yields ""
        # instead of an unbound name or the previous sentence's words.
        trans_sent_wo_punct = " ".join(chosen_words)
        print("trans_sent_wo_punct", trans_sent_wo_punct)

        transliterated_sentence = final_transliterated_sentence(sentence, trans_sent_wo_punct)
        print("trans_sent_with_punct", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)

    return " ".join(transliterated_text)

def dial_comparison_transliteration_arbic_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Arabic-script text to Roman script, one word at a time.

    Candidates per word come from indic_trans, Azure and buck_2_unicode;
    ``compare_outputs_transliteration`` selects the one that is kept.

    Returns:
        The chosen words joined by single spaces. Stand-alone "." tokens are
        skipped.
    """
    print("hello")
    sources_name = {'0':'indic_trans', '1':'Azure', '2':'buck_2_unicode'}
    priority_list = ['indic_trans', 'Azure', 'buck_2_unicode']
    skipped_fragments = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_fragments:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            # Candidate order must match the indices used in sources_name.
            candidates = [
                indic_trans(word, source_script, dest_script),
                azure_transliteration(word, source_lang, source_script, dest_script),
                buck_2_unicode(word),
            ]
            chosen_words.append(
                compare_outputs_transliteration(word, candidates, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))
    return " ".join(transliterated_text)


def dial_comparison_transliteration_kann_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Kannada text to Roman script, one word at a time.

    Candidates per word come from om_transliterator, indic_trans, libindic
    and Azure; ``compare_outputs_transliteration`` selects the one that is
    kept.

    Returns:
        The chosen words joined by single spaces. Stand-alone "." tokens are
        skipped.
    """
    print("hello")
    sources_name = {'0':'om_transliteration', '1':'indic_trans', '2':'libindic', '3':'Azure'}
    priority_list = ['om_transliteration', 'indic_trans', 'libindic', 'Azure']
    skipped_fragments = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_fragments:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            t0 = om_transliterator(word)
            t1 = indic_trans(word, source_script, dest_script)
            # libindic is given the current word. It used to receive the whole
            # `text`, which made the full dialogue a candidate for every
            # single word. Trailing whitespace is stripped the same way the
            # Devanagari-to-Roman function in this file treats libindic output.
            t2 = libindic(word, dest_script).rstrip()
            t3 = azure_transliteration(word, source_lang, source_script, dest_script)
            # Candidate order must match the indices used in sources_name.
            outputs = [t0, t1, t2, t3]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))
    return " ".join(transliterated_text)

def dial_comparison_transliteration_tamil_to_rom_ph1(text, source_lang, source_script, dest_script):
    """Transliterate Tamil text to Roman script, one word at a time.

    Candidates per word come from Azure, libindic and indic_trans;
    ``compare_outputs_transliteration`` selects the one that is kept.

    Returns:
        The chosen words joined by single spaces. Stand-alone "." tokens are
        skipped.
    """
    print("hello")
    sources_name = {'0':'Azure', '1':'libindic', '2':'indic_trans', }
    priority_list = ['Azure', 'libindic', 'indic_trans']
    skipped_fragments = ("", " . . .", " . .", " . . ”")

    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang='en'):
        if sentence in skipped_fragments:
            continue
        print(sentence)
        chosen_words = []
        for word in sentence.split():
            if word == ".":
                continue
            print(word)
            azure_out = azure_transliteration(word, source_lang, source_script, dest_script)
            # libindic is given the current word. It used to receive the whole
            # `text`, which made the full dialogue a candidate for every
            # single word. Trailing whitespace is stripped the same way the
            # Devanagari-to-Roman function in this file treats libindic output.
            libindic_out = libindic(word, dest_script).rstrip()
            indic_trans_out = indic_trans(word, source_script, dest_script)
            # Listed in the index order of sources_name (0 Azure, 1 libindic,
            # 2 indic_trans). The libindic and indic_trans results used to be
            # swapped here, so each was reported and prioritised under the
            # other's name.
            outputs = [azure_out, libindic_out, indic_trans_out]
            chosen_words.append(
                compare_outputs_transliteration(word, outputs, sources_name, priority_list)
            )
        transliterated_text.append(" ".join(chosen_words))
    return " ".join(transliterated_text)

def compare_outputs_transliteration(word, outputs, sources_name, priority_list):
    """Pick the transliteration to keep for *word* and log the choice.

    ``selection_source_transliteration`` returns two results; each is indexed
    here as (text, source). Both are written as a new row of the module-level
    report table (``doc2`` / ``table2``), the first one is printed, and its
    text is returned.
    """
    first_choice, second_choice = selection_source_transliteration(
        sources_name, outputs, priority_list
    )
    print(first_choice)
    add_dial_comparison_doc2_transliteration(
        doc2, table2, word, first_choice, second_choice, sources_name
    )
    return first_choice[0]

def add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name):
    """Append one comparison row to *table2*.

    Column 0 receives the input word; columns 1 and 2 receive the text of
    the first and second choice, each followed by a '(Source : ...)' run
    naming where it came from. ``doc2`` and ``sources_name`` are accepted
    but not used.
    """
    cells = table2.add_row().cells
    cells[0].text = word
    for column, choice in ((1, O1ANDS1), (2, O2ANDS2)):
        cell = cells[column]
        cell.text = choice[0]
        cell.paragraphs[0].add_run('(Source : ' + str(choice[1]) + ')')


# Collect the dialogues of the ORIGINAL script (filename2), in order, so the
# main loop can pair them with the transliterated dialogues when
# translation_and_transliteration == "Yes".
#
# The selection rule here has to mirror the main loop exactly, because that
# loop consumes this list purely by position (original_dialogues[j]): every
# non-empty dialogue that is not a transition is collected, whether or not it
# carries a parenthetical. Dialogues with a parenthetical used to be skipped
# here, which shifted every following pair and could end in an IndexError.
original_dialogues = []
for scene in tqdm(scenes1[:5]):
    for i, line in enumerate(scene):
        if i == 0:
            continue  # slug line
        if isinstance(line, str):
            continue  # action line
        print("In dialogue")
        [speaker] = line.keys()
        if speaker == 'Transition':
            continue
        dialogue = line[speaker][2]
        if dialogue == "":
            continue
        original_dialogues.append(dialogue)


# Detect, from the translated script itself, the language of the action lines
# and the language + script of the dialogues.
#
# All three names start as None so that a script without action lines (or
# without any eligible dialogue) no longer dies with a NameError at the
# summary prints below. Scanning stops at the end of the first scene after
# which both kinds of line have actually been detected; within a scene the
# last detected line wins, as before.
non_dial_src_lang = None
dial_src_lang = None
dial_src_script = None

for scene in tqdm(scenes):
    for i, line in enumerate(scene):
        if i == 0:
            continue  # slug line
        if isinstance(line, str):
            non_dial_src_lang = language_detector(line)
            continue

        [speaker] = line.keys()
        # Transitions and dialogues carrying a parenthetical are not used
        # for detection, and therefore do not count as "dialogue found".
        if speaker == 'Transition' or line[speaker][0] != 'NONE':
            continue
        dial_src_lang = language_detector(line[speaker][2])
        dial_src_script = script_det(line[speaker][2])

    if non_dial_src_lang is not None and dial_src_script is not None:
        break

print("non_dial_src_lang", non_dial_src_lang)
print("dial_src_lang", dial_src_lang)
print("dial_src_script", dial_src_script)
print("dial_dest_script", dial_dest_script)


# (source script, destination script) -> transliteration routine.
_TRANSLITERATORS = {
    ("Devanagari", "Latin"): dial_comparison_transliteration_dev_rom_ph1,
    ("Latin", "Devanagari"): dial_comparison_transliteration_rom_dev_ph1,
    ("Arabic", "Latin"): dial_comparison_transliteration_arbic_to_rom_ph1,
    ("Kannada", "Latin"): dial_comparison_transliteration_kann_to_rom_ph1,
    ("Tamil", "Latin"): dial_comparison_transliteration_tamil_to_rom_ph1,
}

# Write the first five scenes to `doc`: slug lines, action lines, transitions,
# speakers and parentheticals are copied as they are; dialogues are
# transliterated from dial_src_script to dial_dest_script.
j = 0  # position of the next entry to take from original_dialogues
for scene in tqdm(scenes[:5]):
    for i, line in enumerate(scene):
        if i == 0:
            addSlugLine(doc, line)
            continue
        if isinstance(line, str):
            addActionLine(doc, line, non_dial_src_lang)
            continue

        print("In dialogue")
        [speaker] = line.keys()
        if speaker == 'Transition':
            addTransition(doc, line[speaker])
            continue

        addSpeaker(doc, speaker)
        parenthetical = line[speaker][0]
        dialogue = line[speaker][2]
        if parenthetical != 'NONE':
            # Parentheticals are written untranslated. The former translate
            # branch could never run and referenced an undefined name
            # (non_dial_dest_lang), so it has been dropped.
            addParenthetical(doc, parenthetical)

        print("dialogue to be transliterated ", dialogue)
        if dialogue == "":
            continue

        transliterate = _TRANSLITERATORS.get((dial_src_script, dial_dest_script))
        if transliterate is None:
            # Previously this case fell through every branch and failed with
            # a bare NameError on trans_text.
            raise ValueError(
                "Unsupported transliteration: "
                + str(dial_src_script) + " -> " + str(dial_dest_script)
            )
        trans_text = transliterate(dialogue, dial_src_lang, dial_src_script, dial_dest_script)

        if dual_dial_script == "Yes":
            if translation_and_transliteration == "Yes":
                # Pair with the dialogue of the original (untranslated) script.
                dual_script(doc, original_dialogues[j], trans_text, dial_src_lang)
                j = j + 1
            else:
                dual_script(doc, dialogue, trans_text, dial_src_lang)
        else:
            addDialogue(doc, trans_text, dial_src_lang)

# Make sure the output folder exists, so the run does not fail at the very
# last step after all the transliteration work has been done.
os.makedirs(os.path.dirname(doc_file) or ".", exist_ok=True)
doc.save(doc_file)
#doc1a.save("1"+doc_file)
#doc1b.save("2"+doc_file)
doc2.save("final.docx")