# Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/compare_srt_output.py

# from google_srt import google_sub
from .translation_metric import diff_score
import os
import sys
import re
from .translation_metric import (
    manual_diff_score,
    bleu_diff_score,
    gleu_diff_score,
    meteor_diff_score,
    rouge_diff_score,
    diff_score,
    critera4_5,
)
from .script_detector import script_cat
from .translation_metric import (
    manual_diff_score,
    bleu_diff_score,
    gleu_diff_score,
    meteor_diff_score,
    rouge_diff_score,
    diff_score,
    critera4_5,
)
from .selection_source import (
    selection_source,
    function5,
    function41,
    function311,
    function221,
    function2111,
    function11111,
    selection_source_transliteration,
    two_sources_two_outputs,
)
from .script_writing import default_script


def compare_subs(current):
    """Choose one of the three subtitle files stored in directory ``current``.

    The directory is expected to hold ``g_subtitles.srt`` (Google),
    ``a_subtitles.srt`` (AWS) and ``az_subtitles.srt`` (Azure).  The spoken
    text of each file is extracted, the three texts are ranked with the
    scoring helpers imported from ``translation_metric`` and
    ``selection_source``, and the path of the file whose text was selected
    is returned.  A source whose file is missing, unreadable or empty is
    represented by the placeholder text ``"."`` and is skipped when the
    final path is chosen.

    Args:
        current: Directory path (string) containing the three SRT files.

    Returns:
        The path of the selected SRT file.  Falls back to the Azure path
        when every source at or after the selected position is unusable.

    Raises:
        ValueError: If the selected text is not one of the extracted texts.
        KeyError: If ``selection_source`` names a source that is not listed
            in the source-name mapping.

    Side effects:
        Prints the three candidate paths and the selected path.
    """
    # Raw strings: the original literals contained the invalid escape "\,",
    # which only worked by accident and triggers a warning on newer Pythons.
    script_punctuation = r"""!()-[]{};:'"\,<>./?@#$%^&*_~“"”"""
    strip_characters = r"""!()-[]{};:'"\,<>./?@#$%^&*_~…।1234567890"""
    strip_table = str.maketrans(strip_characters, " " * len(strip_characters))

    # Lines of an SRT file that carry no dialogue: cue index, timestamp, blank.
    index_line = re.compile(r"^[0-9]+$")
    timestamp_line = re.compile(r"^[0-9]{2}:[0-9]{2}:[0-9]{2}")
    blank_line = re.compile(r"^$")

    def script_det(text):
        """Return the script of the first non-punctuation character of text.

        An empty string is passed to ``script_cat`` when text contains
        nothing but punctuation.
        """
        first_char = next(
            (char for char in text if char not in script_punctuation), ""
        )
        return script_cat(first_char)[0]

    def punct_remover(string):
        """Replace every punctuation mark and ASCII digit with a space."""
        return string.translate(strip_table)

    def word_transliterate(sentence, dest_script):
        """Placeholder: currently returns ``sentence`` unchanged."""
        return sentence

    def final_out(output1, output2, output3, dest_lang):
        """Return the first candidate written entirely in the target script.

        Candidates are tried in order.  When none qualifies, ``output1`` is
        returned after passing through ``word_transliterate``.
        """
        target_script = default_script[dest_lang]
        for candidate in (output1, output2, output3):
            words = punct_remover(candidate).split()
            if all(script_det(word) == target_script for word in words):
                return candidate
        return word_transliterate(output1, target_script)

    def compare_outputs(sentence, t0, trans, sources_name, target_lang):
        """Score the texts in ``trans`` and return the preferred one.

        ``t0`` is the baseline text the metric winners are compared with;
        it is returned unchanged when every metric picks exactly that text.
        ``sentence`` is accepted for call compatibility and is not used.
        """
        methods_name = {
            "0": "MNF",
            "1": "Gleu",
            "2": "Meteor",
            "3": "Rougen",
            "4": "Rougel",
        }
        google_output = t0
        output1, source1 = manual_diff_score(trans, sources_name)
        output2, source2 = gleu_diff_score(trans, sources_name)
        output3, source3 = meteor_diff_score(trans, sources_name)
        output4, source4, output5, source5 = rouge_diff_score(trans, sources_name)

        winners = [
            (output1, source1),
            (output2, source2),
            (output3, source3),
            (output4, source4),
            (output5, source5),
        ]
        if all(text == google_output for text, _ in winners):
            return google_output

        # A metric that agrees with the baseline contributes a blank entry.
        differing_sources = [
            source if text != google_output else " " for text, source in winners
        ]
        s1ANDm1, s2ANDm2, s3ANDm3 = selection_source(
            differing_sources, sources_name, trans, methods_name
        )

        def translation_of(source_name):
            """Return the text in ``trans`` that belongs to ``source_name``."""
            matched_key = None
            for key, name in sources_name.items():
                if name == source_name:
                    matched_key = key
            if matched_key is None:
                raise KeyError(
                    "no translation source named %r" % (source_name,)
                )
            return trans[str(matched_key)]

        first_choice = translation_of(s1ANDm1[0])
        # An empty source name means there is no second/third choice; the
        # first choice is reused in that slot.
        second_choice = (
            translation_of(s2ANDm2[0]) if s2ANDm2[0] != "" else first_choice
        )
        third_choice = (
            translation_of(s3ANDm3[0]) if s3ANDm3[0] != "" else first_choice
        )
        return final_out(first_choice, second_choice, third_choice, target_lang)

    def read_srt_text(path):
        """Return the dialogue text of the SRT file at ``path``.

        Cue numbers, timestamp lines and blank lines are dropped and the
        remaining lines are joined with single spaces.  ``"."`` is returned
        when the file is empty, missing, unreadable or not valid UTF-8.
        """
        try:
            # getsize needs the path; handing it the open file object (as the
            # previous code did) raises TypeError, so no file was ever read.
            if os.path.getsize(path) == 0:
                return "."
            with open(path, "r", encoding="utf8") as srt_file:
                lines = srt_file.readlines()
        except (OSError, UnicodeError):
            # Deliberate best effort: an unusable source is simply skipped.
            return "."
        dialogue = [
            line.rstrip("\n")
            for line in lines
            if not (
                index_line.search(line)
                or timestamp_line.search(line)
                or blank_line.search(line)
            )
        ]
        return " ".join(dialogue).lstrip()

    google_srt = current + "/g_subtitles.srt"
    aws_srt = current + "/a_subtitles.srt"
    azure_srt = current + "/az_subtitles.srt"

    print(google_srt)
    print(aws_srt)
    print(azure_srt)

    # Keys match those of sources_name below; every key is always present.
    trans = {
        "0": read_srt_text(google_srt),
        "1": read_srt_text(aws_srt),
        "2": read_srt_text(azure_srt),
    }

    sentence = ""
    t0 = " "
    sources_name = {"0": "google", "1": "aws", "2": "azure"}
    # The target language is currently hard-coded to English.
    target_lang = "en"

    selected_para = compare_outputs(sentence, t0, trans, sources_name, target_lang)
    position = list(trans.values()).index(selected_para)

    def final_srt(position):
        """Return the path of the first usable source at or after position."""
        srt_paths = (google_srt, aws_srt, azure_srt)
        for index in range(position, len(srt_paths)):
            if trans[str(index)] != ".":
                return srt_paths[index]
        return azure_srt

    selected_subtitle = final_srt(position)
    print(selected_subtitle)
    return selected_subtitle