# Module Imports
import datetime
import json
import logging
import ntpath
import os
import re
import sys
import uuid
from collections import Counter
from importlib import import_module

import docx
import nltk
import nltk.translate.bleu_score as bleu
import nltk.translate.gleu_score as gleu
import numpy as np
import requests
import unidecode
from docx.enum.table import WD_ALIGN_VERTICAL, WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Inches, Pt
from indicnlp.tokenize import sentence_tokenize
from pytz import timezone
from rouge_score import rouge_scorer
from tqdm import tqdm

# import textract

from MNF.settings import BasePath  # Importing Basepath of System

# Helper Files Imports
from .buck_2_unicode import buck_2_unicode
from .detection import language_detector, script_det
from .script_reading import (
    breaksen,
    getRefined,
    getScenes,
    getSlugAndNonSlug,
    getSpeakers,
)
from .script_writing import (
    addActionLine,
    addDialogue,
    addParenthetical,
    addSlugLine,
    addSpeaker,
    addTransition,
    dial_checker,
    dual_script,
    non_dial_checker,
)
from .selection_source import (
    function5,
    function11111,
    function2111,
    function221,
    function311,
    function41,
    selection_source,
    selection_source_transliteration,
    two_sources_two_outputs,
)
from .transString import transString
from .translation_metric import (
    bleu_diff_score,
    critera4_5,
    diff_score,
    gleu_diff_score,
    manual_diff_score,
    meteor_diff_score,
    rouge_diff_score,
)
from .translation_resources import aws, azure, google, yandex
from .transliteration_resources import (
    ConvertToLatin,
    azure_transliteration,
    indic_trans,
    indic_transliteration_DEVANAGRI_OTHER,
    indic_transliteration_GURMUKHI,
    indic_transliteration_GURMUKHI_LATIN,
    indic_transliteration_IAST,
    indic_transliteration_ITRANS,
    indic_transliteration_KANNADA_OTHER,
    indic_transliteration_MALAYALAM_OTHER,
    indic_transliteration_OTHER_DEVANAGRI,
    indic_transliteration_OTHER_GUJARATI,
    indic_transliteration_OTHER_GURMUKHI,
    indic_transliteration_OTHER_KANNADA,
    indic_transliteration_OTHER_ORIYA,
    indic_transliteration_OTHER_TAMIL,
    indic_transliteration_TAMIL_OTHER,
    indic_transliteration_TELUGU,
    indic_transliteration_TELUGU_OTHER,
    libindic,
    om_transliterator,
    # polyglot_trans,
    readonly,
    sheetal,
    translit_CHINESE_LATIN,
    translit_th_sin_mng_heb_to_latin,
    transliteration_CYRILIC_LATIN,
    transliteration_LATIN_CYRILLIC,
    unicode_transliteration_GURMUKHI,
    unicode_transliteration_GURMUKHI_LATIN,
)  # , translit_THAI_LATIN

_LOG = logging.getLogger(__name__)

basePath = BasePath()

# Every character that punct_remover blanks out (includes the backslash).
_PUNCT_TO_SPACE = str.maketrans(
    dict.fromkeys("!()-[]{};:'\"\\,<>./?@#$%^&*_~…।", " ")
)

# Tokens / trailing characters treated as punctuation when punctuation is
# re-attached to a transliterated sentence.
_PUNCT_TOKENS = frozenset(
    list("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") + [" ", "…", "...", "।"]
)

# Sentence fragments that are skipped outright by every comparison routine.
_SKIPPED_SENTENCES = ("", " . . .", " . .", " . . ”")


# -> Punctuation Remover code
def punct_remover(string):
    """Return *string* with every punctuation character replaced by a space."""
    return string.translate(_PUNCT_TO_SPACE)


# -> Space After Punctuation Remover code
def space_after_punct(text):
    """Put a space after ``, ! ? ( ) … -`` and collapse whitespace runs.

    A spaced-out ellipsis (``. . .``) is first normalised to `` ... ``.
    """
    text = text.replace(". . .", " ... ")
    text = re.sub(r"([,!?()…-])", r"\1 ", text)
    return re.sub(r"\s{2,}", " ", text)


# -> Removing Punctuation from Transliterated text code
def final_transliterated_sentence(original, transliterated):
    """Re-attach the punctuation of *original* to *transliterated*.

    *original* is tokenised after ``space_after_punct``.  Stand-alone
    punctuation tokens are copied through; every other token consumes the
    next word of *transliterated* and keeps its own trailing punctuation
    character, if any.

    Raises:
        IndexError: if *transliterated* has fewer words than *original* has
            non-punctuation tokens.
    """
    source_tokens = space_after_punct(original).split()
    target_words = transliterated.split()

    rebuilt = []
    cursor = 0
    for token in source_tokens:
        if token in _PUNCT_TOKENS:
            rebuilt.append(token)
            continue
        word = target_words[cursor]
        cursor += 1
        last_char = token[-1]
        rebuilt.append((word + last_char) if last_char in _PUNCT_TOKENS else word)

    sentence = " ".join(rebuilt)
    # str.replace returns a new string; the results used to be discarded, so
    # the ellipsis clean-up below never took effect.
    sentence = sentence.replace(" ... ", "...")
    sentence = sentence.replace("… ", "…")
    return sentence


def google_length_checker(t, temp_sentence, t0):
    """Pad *t* with the last word(s) of *t0* when it is 1-2 words short.

    *t* is returned unchanged when it already has at least as many words as
    *temp_sentence*, or when it is short by more than two words.
    """
    missing = len(temp_sentence.split()) - len(t.split())
    if missing in (1, 2):
        return t + " " + " ".join(t0.split()[-missing:])
    return t


# Special Symbol(Hindi Sentence Ending) Remover
def Halant_remover(T3):
    """Strip one trailing halant (``्``) from *T3*; safe on an empty string."""
    return T3[:-1] if T3.endswith("्") else T3


def _compare_word_wise(text, split_lang, run_sources, sources_name, priority_list):
    """Transliterate *text* one word at a time.

    ``run_sources(word)`` returns the candidate outputs for a single word (in
    the positional order described by *sources_name*);
    ``compare_outputs_transliteration`` picks one of them.  Sentences listed
    in ``_SKIPPED_SENTENCES`` and lone ``"."`` tokens are skipped.
    """
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang=split_lang):
        if sentence in _SKIPPED_SENTENCES:
            continue
        chosen = []
        for word in sentence.split():
            if word == ".":
                continue
            chosen.append(
                compare_outputs_transliteration(
                    word, run_sources(word), sources_name, priority_list
                )
            )
        transliterated_text.append(" ".join(chosen))
    return " ".join(transliterated_text)


def _compare_sentence_wise(
    text, split_lang, run_sources, sources_name, priority_list, token_fixups=None
):
    """Transliterate *text* one sentence at a time, then choose per word.

    ``run_sources(sentence)`` receives the punctuation-free sentence and
    returns one transliterated string per source.  The strings are aligned
    with the source words by position, ``compare_outputs_transliteration``
    picks a winner for each word, and ``final_transliterated_sentence``
    restores the original punctuation.

    *token_fixups* optionally maps a source index to a callable applied to
    each token of that source before comparison.

    Raises:
        IndexError: if a source returns fewer words than the input sentence.
    """
    token_fixups = token_fixups or {}
    transliterated_text = []
    for sentence in sentence_tokenize.sentence_split(text, lang=split_lang):
        if sentence in _SKIPPED_SENTENCES:
            continue
        _LOG.debug("original_sentence %s", sentence)
        bare_sentence = punct_remover(sentence)
        _LOG.debug("sentence_without_punctuation %s", bare_sentence)

        raw_outputs = run_sources(bare_sentence)
        words = bare_sentence.split()
        # Only split the outputs when there is something to align with.
        columns = [raw.split() for raw in raw_outputs] if words else []

        chosen = []
        for position, word in enumerate(words):
            candidates = []
            for index, column in enumerate(columns):
                token = column[position]
                fixup = token_fixups.get(index)
                candidates.append(fixup(token) if fixup else token)
            chosen.append(
                compare_outputs_transliteration(
                    word, candidates, sources_name, priority_list
                )
            )

        trans_sent_wo_punct = " ".join(chosen)
        _LOG.debug("trans_sent_wo_punct %s", trans_sent_wo_punct)
        transliterated_sentence = final_transliterated_sentence(
            sentence, trans_sent_wo_punct
        )
        _LOG.debug("trans_sent_w_punct %s", transliterated_sentence)
        transliterated_text.append(transliterated_sentence)
    return " ".join(transliterated_text)


def dial_comparison_transliteration_rom_dev_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise Latin -> Devanagari comparison (Azure, indic_trans, google, IAST).

    The language and script arguments are accepted for interface parity but
    are overridden with ``hi`` / ``Latin`` / ``Devanagari``.
    """
    source_lang = "hi"
    source_script = "Latin"
    dest_script = "Devanagari"
    sources_name = {
        "0": "Azure",
        "1": "indic_trans",
        "2": "google",
        "3": "indic_trans_IAST",
    }
    priority_list = ["Azure", "indic_trans", "google", "indic_trans_IAST"]

    def run_sources(word):
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            indic_trans(word, source_script, dest_script),
            google(word, "en", "hi"),
            indic_transliteration_IAST(word),
        ]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_rom_dev_ph1_sentence_wise(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of Azure, indic_trans, google and IAST outputs.

    ``source_lang`` is overridden with ``hi``.  The google output is padded
    from the Azure output when it is one or two words short, and a trailing
    halant is stripped from each IAST token.
    """
    source_lang = "hi"
    sources_name = {
        "0": "Azure",
        "1": "indic_trans",
        "2": "google",
        "3": "indic_trans_IAST",
    }
    priority_list = ["Azure", "indic_trans", "google", "indic_trans_IAST"]

    def run_sources(sentence):
        from_azure = azure_transliteration(
            sentence, source_lang, source_script, dest_script
        )
        from_indic_trans = indic_trans(sentence, source_script, dest_script)
        from_google = google_length_checker(
            google(sentence, "en", "hi"), sentence, from_azure
        )
        from_iast = indic_transliteration_IAST(sentence)
        return [from_azure, from_indic_trans, from_google, from_iast]

    return _compare_sentence_wise(
        text,
        "en",
        run_sources,
        sources_name,
        priority_list,
        token_fixups={3: Halant_remover},
    )


def dial_comparison_transliteration_dev_rom_ph1_sentence_wise(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of indic_trans, Azure, libindic and sheetal.

    A ``source_lang`` of ``ne`` is handled as ``hi``.
    """
    sources_name = {"0": "indic_trans", "1": "Azure", "2": "libindic", "3": "sheetal"}
    priority_list = ["indic_trans", "Azure", "sheetal", "libindic"]
    if source_lang == "ne":
        source_lang = "hi"

    def run_sources(sentence):
        return [
            indic_trans(sentence, source_script, dest_script),
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            libindic(sentence, dest_script).rstrip(),
            sheetal(sentence).replace("\n", ""),
        ]

    return _compare_sentence_wise(text, "hi", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_dev_rom_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of indic_trans, Azure, libindic and sheetal."""
    sources_name = {"0": "indic_trans", "1": "Azure", "2": "libindic", "3": "sheetal"}
    priority_list = ["indic_trans", "Azure", "sheetal", "libindic"]

    def run_sources(word):
        return [
            indic_trans(word, source_script, dest_script),
            azure_transliteration(word, source_lang, source_script, dest_script),
            libindic(word, dest_script).rstrip(),
            sheetal(word).replace("\n", ""),
        ]

    return _compare_word_wise(text, "hi", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_arbic_to_rom_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of indic_trans, Azure and buck_2_unicode."""
    sources_name = {"0": "indic_trans", "1": "Azure", "2": "buck_2_unicode"}
    priority_list = ["indic_trans", "Azure", "buck_2_unicode"]

    def run_sources(word):
        return [
            indic_trans(word, source_script, dest_script),
            azure_transliteration(word, source_lang, source_script, dest_script),
            buck_2_unicode(word),
        ]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_kann_to_rom_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of om_transliterator, indic_trans, libindic, Azure."""
    sources_name = {
        "0": "om_transliteration",
        "1": "indic_trans",
        "2": "libindic",
        "3": "Azure",
    }
    priority_list = ["om_transliteration", "indic_trans", "libindic", "Azure"]

    def run_sources(word):
        return [
            om_transliterator(word),
            indic_trans(word, source_script, dest_script),
            libindic(word, dest_script),
            azure_transliteration(word, source_lang, source_script, dest_script),
        ]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_tamil_to_rom_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of Azure, indic_trans and libindic outputs."""
    sources_name = {
        "0": "Azure",
        "1": "libindic",
        "2": "indic_trans",
    }
    priority_list = ["Azure", "libindic", "indic_trans"]

    def run_sources(word):
        from_azure = azure_transliteration(
            word, source_lang, source_script, dest_script
        )
        from_libindic = libindic(word, dest_script)
        from_indic_trans = indic_trans(word, source_script, dest_script)
        # NOTE(review): sources_name labels slot 1 "libindic" and slot 2
        # "indic_trans", but the outputs have always been passed as
        # [Azure, indic_trans, libindic].  The positional order is kept as-is;
        # confirm the intended priority against compare_outputs_transliteration.
        return [from_azure, from_indic_trans, from_libindic]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_beng_tel_mal_to_rom_ph1(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of Azure, indic_trans and libindic outputs."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "libindic"}
    priority_list = ["Azure", "indic_trans", "libindic"]

    def run_sources(word):
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            indic_trans(word, source_script, dest_script),
            libindic(word, dest_script),
        ]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_latin_gurmukhi(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of Azure and the two GURMUKHI transliterators.

    ``source_lang`` is overridden with ``pa``.
    """
    source_lang = "pa"
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            indic_transliteration_GURMUKHI(sentence),
            unicode_transliteration_GURMUKHI(sentence),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_latin_cyrillic(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of Azure and transliteration_LATIN_CYRILLIC.

    ``source_lang`` is overridden with ``bg``.
    """
    source_lang = "bg"
    sources_name = {"0": "Azure", "1": "indic_trans"}
    priority_list = ["Azure", "indic_trans"]

    def run_sources(sentence):
        return [
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            transliteration_LATIN_CYRILLIC(sentence),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_latin_telugu_sentence_wise(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of four sources; ``source_lang`` is set to ``te``."""
    source_lang = "te"
    sources_name = {
        "0": "indic_translit",
        "1": "Azure",
        "2": "indic_trans",
        "3": "libindic",
    }
    priority_list = ["indic_translit", "Azure", "indic_trans", "libindic"]

    def run_sources(sentence):
        return [
            indic_transliteration_TELUGU(sentence),
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
        ]

    return _compare_sentence_wise(text, "hi", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_gurmukhi_latin_sentence_wise(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of three sources; ``source_lang`` is set to ``pa``."""
    source_lang = "pa"
    sources_name = {"0": "indic_trans", "1": "Azure", "2": "unicode"}
    priority_list = ["indic_trans", "Azure", "unicode"]

    def run_sources(sentence):
        return [
            indic_transliteration_GURMUKHI_LATIN(sentence),
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            unicode_transliteration_GURMUKHI_LATIN(sentence).rstrip(),
        ]

    return _compare_sentence_wise(text, "hi", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_cyrilic_latin_sentence_wise(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of three sources; ``source_lang`` is set to ``bg``."""
    source_lang = "bg"
    sources_name = {"0": "indic_trans", "1": "Azure", "2": "unicode"}
    priority_list = ["indic_trans", "Azure", "unicode"]

    def run_sources(sentence):
        # NOTE(review): slot 0 is labelled "indic_trans" and slot 1 "Azure",
        # yet Azure's output has always been passed first.  Order preserved;
        # confirm which labelling is intended.
        return [
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            transliteration_CYRILIC_LATIN(sentence),
            ConvertToLatin(sentence),
        ]

    return _compare_sentence_wise(text, "hi", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of Azure, libindic and indic_trans outputs.

    ``source_lang`` is derived from *dest_script* for Gujarati, Oriya,
    Malayalam, Tamil, Bengali and Kannada; otherwise it is used as given.
    """
    lang_by_script = {
        "Gujarati": "gu",
        "Oriya": "or",
        "Malayalam": "ml",
        "Tamil": "ta",
        "Bengali": "bn",
        "Kannada": "kn",
    }
    source_lang = lang_by_script.get(dest_script, source_lang)
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_or_ml_gu_te_devanagari(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_OTHER_DEVANAGRI."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_OTHER_DEVANAGRI(sentence, source_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_devanagari_or_ml_gu_te(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_DEVANAGRI_OTHER."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_DEVANAGRI_OTHER(sentence, dest_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_kannada_ml_ta_te_ben(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_KANNADA_OTHER."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_KANNADA_OTHER(sentence, dest_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_ml_ta_te_ben_kannada(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_OTHER_KANNADA."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_OTHER_KANNADA(sentence, source_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_tamil_other(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_TAMIL_OTHER."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_TAMIL_OTHER(sentence, dest_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


def dial_comparison_transliteration_other_tamil(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison using indic_transliteration_OTHER_TAMIL."""
    sources_name = {"0": "Azure", "1": "indic_trans", "2": "indic_trans_IAST"}
    priority_list = ["Azure", "indic_trans", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_transliteration_OTHER_TAMIL(sentence, source_script),
            libindic(sentence, source_script),
            indic_trans(sentence, source_script, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from telugu to malayalam
def dial_comparison_transliteration_te_to_ml(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of indic_trans, libindic and TELUGU_OTHER."""
    sources_name = {"0": "indic_trans", "1": "libindic", "2": "indic_trans_IAST"}
    priority_list = ["indic_trans", "libindic", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_transliteration_TELUGU_OTHER(sentence, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from malayalam to telugu
def dial_comparison_transliteration_ml_to_te(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of Azure, libindic and MALAYALAM_OTHER.

    The selected words are now joined into the sentence.  Previously the
    characters of the last Azure token were space-joined instead, which
    discarded every comparison result.
    """
    sources_name = {"0": "indic_trans", "1": "libindic", "2": "indic_trans_IAST"}
    priority_list = ["indic_trans", "libindic", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            azure_transliteration(sentence, source_lang, source_script, dest_script),
            # indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_transliteration_MALAYALAM_OTHER(sentence, dest_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from gujarati and oriya to gurmukhi
def dial_comparison_transliteration_guj_or_to_gur(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of indic_trans, libindic and OTHER_GURMUKHI."""
    sources_name = {"0": "indic_trans", "1": "libindic", "2": "indic_trans_IAST"}
    priority_list = ["indic_trans", "libindic", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_transliteration_OTHER_GURMUKHI(sentence, source_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from gurmukhi and oriya to gujarati
def dial_comparison_transliteration_gur_or_to_guj(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of indic_trans, libindic and OTHER_GUJARATI."""
    sources_name = {"0": "indic_trans", "1": "libindic", "2": "indic_trans_IAST"}
    priority_list = ["indic_trans", "libindic", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_transliteration_OTHER_GUJARATI(sentence, source_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from gujarati and gurmukhi to oriya
def dial_comparison_transliteration_guj_gur_to_or(
    text, source_lang, source_script, dest_script
):
    """Sentence-wise comparison of indic_trans, libindic and OTHER_ORIYA."""
    sources_name = {"0": "indic_trans", "1": "libindic", "2": "indic_trans_IAST"}
    priority_list = ["indic_trans", "libindic", "indic_trans_IAST"]

    def run_sources(sentence):
        return [
            indic_trans(sentence, source_script, dest_script),
            libindic(sentence, dest_script),
            indic_transliteration_OTHER_ORIYA(sentence, source_script),
        ]

    return _compare_sentence_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from latin to arabic
def dial_comparison_transliteration_latin_arabic(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of Azure and transString; ``source_lang`` is ``ar``."""
    sources_name = {"0": "Azure", "1": "transString"}
    priority_list = ["Azure", "transString"]
    source_lang = "ar"

    def run_sources(word):
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            transString(word, 1),
            # polyglot_trans(word, source_script, dest_script),
        ]

    return _compare_word_wise(text, "en", run_sources, sources_name, priority_list)


# -> Function to transliterate from chinese to latin
def dial_comparison_transliteration_chinese_latin(
    text, source_lang, source_script, dest_script
):
    """Word-wise comparison of Azure and translit_CHINESE_LATIN.

    The joined result is passed through ``unidecode.unidecode``.
    """
    sources_name = {"0": "Azure", "1": "pinyin"}
    priority_list = ["Azure", "pinyin"]

    def run_sources(word):
        return [
            azure_transliteration(word, source_lang, source_script, dest_script),
            translit_CHINESE_LATIN(word),
            # polyglot_trans(word, source_script, dest_script),
        ]

    return unidecode.unidecode(
        _compare_word_wise(text, "en", run_sources, sources_name, priority_list)
    )


# NOTE(review): the definition below runs past the end of the reviewed chunk,
# so its visible fragment is left byte-for-byte untouched.
# -> Function to transliterate from thai, sinhala, mongolian and Hebrew to latin def dial_comparison_transliteration_th_sin_mng_heb_latin(text, source_lang, source_script, dest_script): sources_name = {'0': 'Azure', '1': 'anyascii'} sentences = sentence_tokenize.sentence_split(text, lang='en') priority_list = ['Azure', 'anyascii'] if source_lang == "iw": source_lang = "he" transliterated_text = [] for sentence in sentences: if sentence == "" or sentence == " . . ." or sentence == " . ." or sentence == " . . 
”": continue OUT = [] for word in sentence.split(): if word == ".": continue t0 = azure_transliteration( word, source_lang, source_script, dest_script) t1 = translit_th_sin_mng_heb_to_latin(word) outputs = [t0, t1] out = compare_outputs_transliteration( word, outputs, sources_name, priority_list) OUT.append(out) transliterated_text.append(" ".join(OUT)) return " ".join(transliterated_text) def compare_outputs_transliteration(word, outputs, sources_name, priority_list): # print(outputs) # doc2 = docx.Document() # sections = doc2.sections # for section in sections: # section.top_margin = Inches(0.2) # section.bottom_margin = Inches(0.2) # section.left_margin = Inches(0.2) # section.right_margin = Inches(0.2) # section = doc2.sections[-1] # new_height = section.page_width # section.page_width = section.page_height # section.page_height = new_height # name = 'Final table ' + doc_file # doc2.add_heading(name, 0) # doc_para = doc2.add_paragraph() # doc_para.add_run('Translation resources used : Google, IBM watson, AWS, Azure, Lingvanex, Yandex').bold = True # table2 = doc2.add_table(rows=1, cols=4) # table2.style = 'TableGrid' # hdr_Cells = table2.rows[0].cells # hdr_Cells[0].paragraphs[0].add_run("Input").bold = True # hdr_Cells[1].paragraphs[0].add_run("Output1").bold = True # hdr_Cells[2].paragraphs[0].add_run("Output2").bold = True # hdr_Cells[3].paragraphs[0].add_run("Output3").bold = True O1ANDS1, O2ANDS2 = selection_source_transliteration( sources_name, outputs, priority_list ) print(O1ANDS1, "compare all transliterations") # add_dial_comparison_doc2_transliteration(doc2, table2, word, O1ANDS1, O2ANDS2, sources_name) return O1ANDS1[0] def add_dial_comparison_doc2_transliteration( doc2, table2, word, O1ANDS1, O2ANDS2, sources_name ): row_Cells = table2.add_row().cells row_Cells[0].text = word row_Cells[1].text = O1ANDS1[0] row_Cells[1].paragraphs[0].add_run("(Source : " + str(O1ANDS1[1]) + ")") row_Cells[2].text = O2ANDS2[0] 
row_Cells[2].paragraphs[0].add_run("(Source : " + str(O2ANDS2[1]) + ")") # -> Housing all the Script Pair Combinations for Transliterations def transliterate(dest_script, src_script, src_lang, text): print("transliterate",dest_script, src_script, src_lang, text) # if src_script == "Common" or dest_script == "Common" or src_script == "None" or dest_script == "None" or src_script == dest_script: # return trans_text = text if dest_script == "Latin" and src_script == "Devanagari": # trans_text = dial_comparison_transliteration_dev_rom_ph1(text, src_lang, src_script,dest_script) trans_text = dial_comparison_transliteration_dev_rom_ph1_sentence_wise( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Latin": trans_text = dial_comparison_transliteration_rom_dev_ph1( text, src_lang, src_script, dest_script ) # trans_text=dial_comparison_transliteration_rom_dev_ph1_sentence_wise(text, src_lang, src_script,dest_script) elif dest_script == "Latin" and src_script == "Arabic": trans_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Kannada": trans_text = dial_comparison_transliteration_kann_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Tamil": trans_text = dial_comparison_transliteration_tamil_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Bengali": trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Telugu": trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Malayalam": trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Gurmukhi" and 
src_script == "Latin": trans_text = dial_comparison_transliteration_latin_gurmukhi( text, src_lang, src_script, dest_script ) elif dest_script == "Cyrillic" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_cyrillic( text, src_lang, src_script, dest_script ) elif dest_script == "Telugu" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_telugu_sentence_wise( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Gurmukhi": trans_text = dial_comparison_transliteration_gurmukhi_latin_sentence_wise( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Cyrillic": trans_text = dial_comparison_transliteration_cyrilic_latin_sentence_wise( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Gujarati": trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Oriya": trans_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif dest_script == "Oriya" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif dest_script == "Tamil" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif dest_script == "Bengali" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif 
dest_script == "Devanagari" and src_script == "Oriya": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Gujarati": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Malayalam": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Telugu": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Oriya" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Telugu" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Bengali": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Gurmukhi": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Kannada": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Bengali" and src_script == "Devanagari": trans_text = 
dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Tamil" and src_script == "Kannada": trans_text = dial_comparison_transliteration_kannada_ml_ta_te_ben( text, src_lang, src_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Kannada": trans_text = dial_comparison_transliteration_kannada_ml_ta_te_ben( text, src_lang, src_script, dest_script ) elif dest_script == "Telugu" and src_script == "Kannada": trans_text = dial_comparison_transliteration_kannada_ml_ta_te_ben( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Tamil": trans_text = dial_comparison_transliteration_ml_ta_te_ben_kannada( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Malayalam": trans_text = dial_comparison_transliteration_ml_ta_te_ben_kannada( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Telugu": trans_text = dial_comparison_transliteration_ml_ta_te_ben_kannada( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( text, src_lang, src_script, dest_script ) elif dest_script == "Tamil" and src_script == "Devanagari": trans_text = dial_comparison_transliteration_devanagari_or_ml_gu_te( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Tamil": trans_text = dial_comparison_transliteration_or_ml_gu_te_devanagari( text, src_lang, src_script, dest_script ) elif dest_script == "Telugu" and 
src_script == "Tamil": trans_text = dial_comparison_transliteration_tamil_other( text, src_lang, src_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Tamil": trans_text = dial_comparison_transliteration_tamil_other( text, src_lang, src_script, dest_script ) elif dest_script == "Tamil" and src_script == "Malayalam": trans_text = dial_comparison_transliteration_other_tamil( text, src_lang, src_script, dest_script ) elif dest_script == "Tamil" and src_script == "Telugu": trans_text = dial_comparison_transliteration_other_tamil( text, src_lang, src_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Telugu": trans_text = dial_comparison_transliteration_te_to_ml( text, src_lang, src_script, dest_script ) elif dest_script == "Telugu" and src_script == "Malayalam": trans_text = dial_comparison_transliteration_ml_to_te( text, src_lang, src_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Gujarati": trans_text = dial_comparison_transliteration_guj_or_to_gur( text, src_lang, src_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Gurmukhi": trans_text = dial_comparison_transliteration_gur_or_to_guj( text, src_lang, src_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Oriya": trans_text = dial_comparison_transliteration_gur_or_to_guj( text, src_lang, src_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Oriya": trans_text = dial_comparison_transliteration_guj_or_to_gur( text, src_lang, src_script, dest_script ) elif dest_script == "Oriya" and src_script == "Gujarati": trans_text = dial_comparison_transliteration_guj_gur_to_or( text, src_lang, src_script, dest_script ) elif dest_script == "Oriya" and src_script == "Gurmukhi": trans_text = dial_comparison_transliteration_guj_gur_to_or( text, src_lang, src_script, dest_script ) elif dest_script == "Bengali" and src_script == "Kannada": trans_text = 
dial_comparison_transliteration_kannada_ml_ta_te_ben( text, src_lang, src_script, dest_script ) elif dest_script == "Kannada" and src_script == "Bengali": trans_text = dial_comparison_transliteration_ml_ta_te_ben_kannada( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_rom_dev_ph1( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_gurmukhi( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Cyrillic" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_cyrillic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Latin": trans_text = dial_comparison_transliteration_latin_arabic( text, src_lang, src_script, dest_script ) elif dest_script == "Cyrillic" and src_script == "Devanagari": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_dev_rom_ph1_sentence_wise( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_cyrillic( temp_text, src_lang, temp_dest_script, dest_script ) elif 
dest_script == "Kannada" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Tamil" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Telugu" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_telugu_sentence_wise( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Bengali" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Oriya" and src_script == "Arabic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_arbic_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Cyrillic" and src_script == "Kannada": temp_dest_script = "Latin" temp_text = 
dial_comparison_transliteration_kann_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_cyrillic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Kannada": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_kann_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Kannada": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_kann_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_gurmukhi( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Oriya" and src_script == "Kannada": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_kann_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Cyrillic" and src_script == "Tamil": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_tamil_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_cyrillic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Tamil" and src_script == "Cyrillic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_cyrilic_latin_sentence_wise( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Tamil" and src_script == "Bengali": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text 
= dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Telugu" and src_script == "Bengali": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_telugu_sentence_wise( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Malayalam" and src_script == "Bengali": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Devanagari": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_dev_rom_ph1_sentence_wise( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_arabic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Cyrillic": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_cyrilic_latin_sentence_wise( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_arabic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Gurmukhi": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_gurmukhi_latin_sentence_wise( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_arabic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Gujarati": temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_beng_tel_mal_to_rom_ph1( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_arabic( temp_text, 
src_lang, temp_dest_script, dest_script ) elif dest_script == "Latin" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" trans_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, dest_script ) elif dest_script == "Devanagari" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_rom_dev_ph1( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Arabic" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_arabic( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gurmukhi" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_gurmukhi( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Gujarati" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, dest_script ) elif dest_script == "Oriya" and src_script == "Hanji": if src_lang == "zh-CN": src_lang = "zh-Hans" temp_dest_script = "Latin" temp_text = dial_comparison_transliteration_chinese_latin( text, src_lang, src_script, temp_dest_script ) trans_text = dial_comparison_transliteration_latin_to_gu_or_ml_ta_bn( temp_text, src_lang, temp_dest_script, 
dest_script ) elif dest_script == "Latin" and src_script == "Thai": trans_text = dial_comparison_transliteration_th_sin_mng_heb_latin( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Sinhala": trans_text = dial_comparison_transliteration_th_sin_mng_heb_latin( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Hebrew": trans_text = dial_comparison_transliteration_th_sin_mng_heb_latin( text, src_lang, src_script, dest_script ) elif dest_script == "Latin" and src_script == "Mongolian": src_lang = "mn-Cyrl" trans_text = dial_comparison_transliteration_th_sin_mng_heb_latin( text, src_lang, src_script, dest_script ) return trans_text # -> Main Transliteration Function to co-ordingate all the functions def makeTransliteration_only(**kwargs): # Seting the Variables Required for Transliteration dial_dest_script = kwargs.get("dial_dest_script") dual_dial_script = kwargs.get("dual_dial_script") original_file = kwargs.get("original_file") dial_dest_lang = kwargs.get("dial_dest_lang") is_dialogue_transliteration_required = kwargs.get( "is_dialogue_transliteration_required" ) is_action_line_transliteration_required = kwargs.get( "is_action_line_transliteration_required" ) action_line_dest_script = kwargs.get("action_line_dest_script") action_line_src_lang = kwargs.get("action_line_src_lang") action_line_src_script = kwargs.get("action_line_src_script") scenes_original = kwargs.get("scenes_original") restrict_to_five = kwargs.get("restrict_to_five") filename2 = original_file # -> Checking if Transliteration is really Required or not if ( is_dialogue_transliteration_required == False and is_action_line_transliteration_required == False and dual_dial_script == "No" ): return original_file, scenes_original # create an instance of a word document doc = docx.Document() x = datetime.datetime.now(timezone("UTC")).astimezone( timezone("Asia/Kolkata")) if kwargs.get('ignore_because_sample_script') == True: 
doc_file = filename2 else: doc_file = ( basePath + "/media/scripts/translated/" + "trans_" + str(dial_dest_lang) + "_" + str(x.strftime("%d")) + "_" + str(x.strftime("%b")) + "_" + str(x.strftime("%H")) + str(x.strftime("%I")) + "_" + "trans" + "_of_" + ntpath.basename(filename2) ) # -> Getting All the scenes form the Script File with updated actionlines from whichever previously concluded steps refined, total_scenes = getRefined(filename2) sluglines, without_slug = getSlugAndNonSlug(refined) characters = getSpeakers(without_slug) scenes1, actionline, parenthetical_lis, speakers, dialogues = getScenes( refined, total_scenes, characters ) # -> Restricitng Number of scenes to five if user only wants sample of script if restrict_to_five == "yes": scenes1 = scenes1[:5] # -> This forloop detects actionline source language, dialogue source language and dialogue source script # to avoid the load for detection of language in each and every line in next code(for-loop) for scene in tqdm(scenes1): x = "False" y = "False" for i, line in enumerate(scene): if i == 0: continue if isinstance(line, str): x = "True" non_dial_src_lang = language_detector(line) else: [speaker] = line.keys() if speaker == "Transition": continue if line[speaker][0] != "NONE": continue if line[speaker][2] == "": continue y = "True" dial_src_lang = language_detector(line[speaker][2]) dial_src_script = script_det(line[speaker][2]) if x == "True" and y == "True": break scenes_current = scenes1 if scenes_original: scenes1 = zip(scenes1, scenes_original) else: scenes1 = zip(scenes1, scenes1) # -> Transliterating The Text Begins here for scene, scene_original in tqdm(scenes1): for i, (line, line_original) in enumerate(zip(scene, scene_original)): if i == 0: addSlugLine(doc, line) continue if isinstance(line, str): print("transliterating action lines ",action_line_dest_script, action_line_src_script, action_line_src_lang, line) if is_action_line_transliteration_required: trans_text = transliterate( 
action_line_dest_script, action_line_src_script, action_line_src_lang, line, ) else: trans_text = line addActionLine(doc, trans_text, non_dial_src_lang) else: print("In dialogue") [speaker] = line.keys() if speaker == "Transition": # if want to translate transition also along with action line use addTransition # (doc,translator.translate(speaker,dest = gtrans_dict[actionline_dest_lang]).text) addTransition(doc, line[speaker]) continue addSpeaker(doc, speaker) if line[speaker][0] != "NONE": # In parenthitical part addParenthetical(doc, line[speaker][0]) print("dialogue to be transliterated ", line[speaker][2]) if line[speaker][2] == "": continue trans_text = line[speaker][2] if is_dialogue_transliteration_required: if dial_dest_script == dial_src_script: trans_text = trans_text else: trans_text = transliterate( dial_dest_script, dial_src_script, dial_src_lang, trans_text ) if dual_dial_script == "Yes": dual_script( doc, line_original[speaker][2], trans_text, dial_src_lang ) else: addDialogue(doc, trans_text, dial_src_lang) # Saving the Docfile doc.save(doc_file) print("done file is saved") return doc_file, scenes_current