320 lines
10 KiB
Python
Executable File
320 lines
10 KiB
Python
Executable File
# from google_srt import google_sub
|
|
from .translation_metric import diff_score
|
|
import os
|
|
import sys
|
|
import re
|
|
from .translation_metric import (
|
|
manual_diff_score,
|
|
bleu_diff_score,
|
|
gleu_diff_score,
|
|
meteor_diff_score,
|
|
rouge_diff_score,
|
|
diff_score,
|
|
critera4_5,
|
|
)
|
|
from .script_detector import script_cat
|
|
from .translation_metric import (
|
|
manual_diff_score,
|
|
bleu_diff_score,
|
|
gleu_diff_score,
|
|
meteor_diff_score,
|
|
rouge_diff_score,
|
|
diff_score,
|
|
critera4_5,
|
|
)
|
|
from .selection_source import (
|
|
selection_source,
|
|
function5,
|
|
function41,
|
|
function311,
|
|
function221,
|
|
function2111,
|
|
function11111,
|
|
selection_source_transliteration,
|
|
two_sources_two_outputs,
|
|
)
|
|
from .script_writing import default_script
|
|
|
|
|
|
def compare_subs(current):
    """Pick one of the subtitle files found under ``current`` and return its path.

    Three candidates are considered: ``g_subtitles.srt``, ``a_subtitles.srt``
    and ``az_subtitles.srt`` (named "google", "aws" and "azure" in the body).
    Their text is extracted, compared through the imported metric helpers,
    and the path of the chosen file is printed and returned.

    Args:
        current: Directory prefix as a string; the file names are appended
            to it with ``+``.

    Returns:
        The path string of the selected ``.srt`` file.
    """
|
|
def script_det(text):
    """Return the script of the first non-punctuation character in ``text``.

    Args:
        text: The word (or any string) to classify.

    Returns:
        The first element of ``script_cat(char)``, where ``char`` is the
        first character of ``text`` that is not listed in ``punctuations``.
        When every character is punctuation (or ``text`` is empty),
        ``script_cat`` is called with an empty string.
    """
    # The backslash is spelled ``\\``: a bare ``\,`` is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    punctuations = """!()-[]{};:'"\\,<>./?@#$%^&*_~“"”"""
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]
|
|
|
|
def punct_remover(string):
    """Return ``string`` with punctuation marks and ASCII digits blanked out.

    Each character listed in ``punctuations`` is replaced by exactly one
    space, so the result has the same length as the input and word
    boundaries are preserved for a later ``split()``.

    Args:
        string: Text to clean.

    Returns:
        A new string; all other characters are left untouched.
    """
    # The backslash is spelled ``\\``: a bare ``\,`` is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    punctuations = """!()-[]{};:'"\\,<>./?@#$%^&*_~…।1234567890"""
    # One translate pass instead of one full-string replace per character.
    blank_out = str.maketrans(punctuations, " " * len(punctuations))
    return string.translate(blank_out)
|
|
|
|
def word_transliterate(sentence, dest_script):
    """Return ``sentence`` unchanged.

    Currently a pass-through: ``dest_script`` is accepted but not used.
    """
    return sentence
|
|
|
|
def final_out(output1, output2, output3, dest_lang):
    """Return the first candidate whose words are all in the target script.

    The candidates are tried in the order ``output1``, ``output2``,
    ``output3``. A candidate is rejected as soon as one of its words (after
    ``punct_remover``) has a script different from
    ``default_script[dest_lang]``. When all three are rejected, ``output1``
    is passed through ``word_transliterate`` and returned.

    Args:
        output1: Preferred candidate text.
        output2: Second-choice candidate text.
        output3: Third-choice candidate text.
        dest_lang: Key into ``default_script``.

    Returns:
        One of the three candidates, or the transliterated ``output1``.
    """
    # Clean all three up front, before any script check is made.
    cleaned1, cleaned2, cleaned3 = (
        punct_remover(candidate) for candidate in (output1, output2, output3)
    )

    def has_off_script_word(cleaned):
        # Stops at the first offending word.
        return any(
            script_det(token) != default_script[dest_lang]
            for token in cleaned.split()
        )

    if not has_off_script_word(cleaned1):
        return output1
    if not has_off_script_word(cleaned2):
        return output2
    if not has_off_script_word(cleaned3):
        return output3
    return word_transliterate(output1, default_script[dest_lang])
|
|
|
|
def compare_outputs(sentence, t0, trans, sources_name, target_lang):
    """Choose a translation from ``trans`` using the imported metric helpers.

    Args:
        sentence: Accepted for interface compatibility; it does not influence
            the result.
        t0: Reference output (called ``google_output`` internally).
        trans: Mapping of source key (``"0"``, ``"1"``, ...) to its text.
        sources_name: Mapping of source key to source name.
        target_lang: Language key forwarded to ``final_out``.

    Returns:
        ``t0`` when every metric output equals it; otherwise the text picked
        by ``final_out`` from the three sources ranked by
        ``selection_source``.
    """
    methods_name = {
        "0": "MNF",
        "1": "Gleu",
        "2": "Meteor",
        "3": "Rougen",
        "4": "Rougel",
    }
    google_output = t0

    # Each helper yields (output, source) pairs; rouge yields two of them.
    output1, source1 = manual_diff_score(trans, sources_name)
    output2, source2 = gleu_diff_score(trans, sources_name)
    output3, source3 = meteor_diff_score(trans, sources_name)
    output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)

    scored = (
        (output1, source1),
        (output2, source2),
        (output3, source3),
        (output4, source4),
        (output5, source5),
    )

    # Pairwise comparison of neighbours, same as a chained ``==``.
    chain = (google_output,) + tuple(out for out, _ in scored)
    if all(left == right for left, right in zip(chain, chain[1:])):
        return google_output

    # A metric that merely agrees with the reference contributes a blank.
    s = [src if google_output != out else " " for out, src in scored]

    s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
        s, sources_name, trans, methods_name
    )

    def text_for(source, fallback):
        # Keep ``fallback`` when no entry of ``sources_name`` matches.
        picked = fallback
        for key, name in sources_name.items():
            if name == source:
                picked = trans[str(key)]
        return picked

    output1 = text_for(s1ANDm1[0], output1)

    if s2ANDm2[0] != "":
        output2 = text_for(s2ANDm2[0], output2)
    else:
        output2 = output1

    if s3ANDm3[0] != "":
        output3 = text_for(s3ANDm3[0], output3)
    else:
        output3 = output1

    return final_out(output1, output2, output3, target_lang)
|
|
|
|
####
|
|
# One candidate subtitle file per source, all located under ``current``.
google_srt, aws_srt, azure_srt = (
    current + suffix
    for suffix in ("/g_subtitles.srt", "/a_subtitles.srt", "/az_subtitles.srt")
)

for srt_path in (google_srt, aws_srt, azure_srt):
    print(srt_path)

# Source key ("0", "1", "2") -> extracted text; filled in by main1().
trans = {}
|
|
|
|
def main1():
    """Fill ``trans`` with the spoken text of each subtitle file.

    Keys ``"0"``, ``"1"`` and ``"2"`` receive the text extracted from
    ``google_srt``, ``aws_srt`` and ``azure_srt`` respectively. Cue numbers,
    timestamp lines and blank lines are dropped and the remaining lines are
    joined with single spaces. A file that is missing, unreadable, not valid
    UTF-8, or empty is recorded as ``"."``.

    Returns:
        The ``trans`` mapping that was updated.
    """
    cue_number = re.compile(r"^[0-9]+$")
    timestamp = re.compile(r"^[0-9]{2}:[0-9]{2}:[0-9]{2}")
    blank = re.compile(r"^$")

    def read_srt_text(path):
        # Best effort: any I/O or decoding failure counts as "no transcript".
        try:
            with open(path, "r", encoding="utf8") as handle:
                lines = handle.readlines()
        except (OSError, UnicodeError):
            return "."
        # No lines at all means the file was empty.
        if not lines:
            return "."
        spoken = [
            line.rstrip("\n")
            for line in lines
            if not (
                cue_number.search(line)
                or timestamp.search(line)
                or blank.search(line)
            )
        ]
        return " ".join(spoken).lstrip()

    # Each source writes to its own key.
    for key, path in (("0", google_srt), ("1", aws_srt), ("2", azure_srt)):
        trans[key] = read_srt_text(path)
    return trans
|
|
|
|
# Load the three transcripts into ``trans``.
main1()

sentence = ""
t0 = " "
sources_name = {"0": "google", "1": "aws", "2": "azure"}
lang = {
    "Arabic": "ar",
    "Bengali": "bn",
    "Catalan": "ca",
    "English": "en",
    "French": "fr",
    "Gujarati": "gu",  # ISO 639-1 code for Gujarati
    "Hindi": "hi",
    "Kannada": "kn",
    "Marathi": "mr",
    "Spanish": "es",
    "Tamil": "ta",
    "Telugu": "te",
}
# target_lang = lang[lang_code]
target_lang = "en"

selected_para = compare_outputs(sentence, t0, trans, sources_name, target_lang)

# Map the selected text back to the position of its source in ``trans``.
val_list = list(trans.values())
try:
    position = val_list.index(selected_para)
except ValueError:
    # The selected text is not one of the transcripts (e.g. the reference
    # ``t0`` was returned); start from the first source and let final_srt
    # fall through to the first one that has content.
    position = 0
|
|
# print("Position", position)
|
|
|
|
def final_srt(position):
    """Return the subtitle path for ``position``, skipping empty sources.

    Starting at ``position`` (0, 1 or 2), the first source whose entry in
    ``trans`` is not ``"."`` wins. ``azure_srt`` is returned when no source
    qualifies or when ``position`` is not 0, 1 or 2.

    Args:
        position: Index of the preferred source.

    Returns:
        One of ``google_srt``, ``aws_srt`` or ``azure_srt``.
    """
    candidates = (("0", google_srt), ("1", aws_srt), ("2", azure_srt))
    reached = False
    for index, (key, path) in enumerate(candidates):
        # Nothing is inspected until the requested position is reached.
        reached = reached or position == index
        if reached and trans[key] != ".":
            return path
    return azure_srt
|
|
|
|
# print(final_srt(position))
|
|
selected_subtitle = final_srt(position)
|
|
print(selected_subtitle)
|
|
return selected_subtitle
|