# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/script_reading.py

# import textract
import re
import docx
import os
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
basePath = BasePath()

# -> Patterns for Detection of Sluglines, Transitions, Actionlines and Dialogues
# Scene heading at the start of a line: optional scene number, optional dot and
# whitespace, then I-or-E, N-or-X, T (i.e. INT / EXT; the classes also admit ENT / IXT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Combined interior/exterior heading: optional scene number and whitespace, then
# I/E or E/I (the classes also admit I/I and E/E) with an optional trailing dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Transition cues; getScenes searches for them anywhere in a paragraph, ignoring case.
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:',
               'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'INTERVAL']
# Words that disqualify an upper-case line from being a character name;
# getSpeakers drops every candidate that contains one of them as a word.
reserved_words = ['MONTAGE', 'PBS', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']


# -> Helper for breaking a long sentence into word chunks (not called in this file)
def breaksen(s):
    """Split a long text into chunks of at most 32 words.

    Args:
        s: The text to split. Words are whatever ``str.split()`` yields.

    Returns:
        list[str]: ``[s]`` unchanged when ``s`` has 256 words or fewer;
        otherwise consecutive 32-word chunks, each re-joined with single
        spaces. No empty chunk is produced, including when the word count is
        an exact multiple of 32.
    """
    words = s.split()  # split once instead of once per chunk
    if len(words) <= 256:
        return [s]
    # Stepping by the chunk size yields exactly ceil(len / 32) non-empty chunks.
    return [" ".join(words[start:start + 32]) for start in range(0, len(words), 32)]


# -> Function for Getting the refined (slug-line led) paragraphs of a Script(Docx)
def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slug line on.

    The work is staged through two scratch files under
    ``{basePath}/conversion/translation``:

    1. ``file.txt`` receives the text ``textract`` extracts from ``filename1``.
    2. ``file1.txt`` receives the filtered text. Lines before the first slug
       line are dropped. Blank lines that follow a speaker cue are skipped, so
       the cue, its parentheticals and the first line after them are written
       without a blank line in between.
    3. ``file1.txt`` is passed through ``textract`` again, written back to
       ``file.txt`` and split on blank lines.

    Diagnostic ``print`` output is kept as it was.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        tuple: ``(refined, total_scenes)``. ``refined`` is the list of
        stripped, non-empty paragraphs starting at the first paragraph that
        looks like a slug line (all paragraphs when none does).
        ``total_scenes`` is the number of paragraphs that preceded that first
        slug line -- despite its name, that is what the counter holds.

    NOTE(review): ``textract`` is used here but its module-level import is
    commented out in this file; the name has to be bound for this to run.
    """
    print("Get_Refined_Called")
    total_scenes = 0
    print("filname", filename1)

    raw_path = rf"{basePath}/conversion/translation/file.txt"
    staged_path = rf"{basePath}/conversion/translation/file1.txt"
    slug_prefixes = ('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT')

    # Speaker cue such as "JOHN", "JEAN-LUC", "MAN #1" or "JOHN (V.O.)".
    # This accepts exactly the same strings as the former
    #   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
    # but without nested quantifiers: that form backtracks exponentially under
    # fullmatch() on long upper-case lines ending in an unmatched character
    # (e.g. a shouted action line with a full stop), which stalls the loop.
    speaker_re = re.compile(r"(?:[A-Z'’]+(?:\s+-*|-+))*[A-Z'’]*(?:#*\s*[1-9])*(?:\(.*\))?")
    # Speaker cue introduced by an honorific, e.g. "DR. WATSON".
    titled_re = re.compile(
        r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+")

    def looks_like_slug(text):
        # Scene-heading test shared by the line pass and the paragraph pass.
        return bool(text.startswith(slug_prefixes)
                    or re.match(slug_pattern, text)
                    or re.match(pat, text))

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_out:
        raw_out.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_in:
        doc11 = raw_in.read()

    dialog_coming = False  # True while the lines after a speaker cue are expected
    flag = False  # becomes True once the first slug line has been written
    with open(staged_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("Original Line:", line)

            # -> For Detection of Slug lines (INTERCUT / INTERMISSION / INTERVAL are not headings)
            if looks_like_slug(line) and not line.startswith(
                    ('INTERCUT', 'INTERMISSION', 'INTERVAL')):
                flag = True
                f1.write(line)
                f1.write('\n')
                continue

            # -> for Detection of Probable Dialogues, Speakers and Actionlines
            print("line 88: Other than Slugline:", line)
            if not flag:
                # Nothing is kept until the first slug line has been seen.
                continue
            print("line 90: else-if condition:", line)
            print("stuck here 1")
            print("stuck here 1.1")
            # `line` is already stripped, so "blank" simply means empty.
            if dialog_coming and line == "":
                print("stuck here 2")
                print("line empty or just have newline", line)
                continue
            print("stuck here 3")

            # -> for Detection of Probable Dialogues
            if dialog_coming:
                print("Probable dialogue or PC: ", line)
                f1.write(line)
                f1.write('\n')
                if re.match(r"\(.*\)", line):
                    # A parenthetical: the dialogue itself is still to come.
                    print("stuck here 4")
                else:
                    print("stuck here 5")
                    dialog_coming = False
                    print("line 107: else of PCs and dialog over")
                continue

            # -> Detection of Speaker which implies that next line will be dialogue
            print("stuck here 6")
            line_is_upper = line.isupper()
            if line_is_upper:
                print("Qualified a certian category1")
            else:
                print("not Qualified a certian category1")

            plain_cue = speaker_re.fullmatch(line) is not None
            if plain_cue:
                print("Qualified a certian category2")
            else:
                print("bot Qualified a certian categor2")

            titled_cue = titled_re.fullmatch(line) is not None
            if titled_cue:
                print("Qualified a certian category3")
            else:
                print("bot Qualified a certian category3")

            if line_is_upper and (plain_cue or titled_cue):
                print("line 111: May be speaker: ", line)
                f1.write(line)
                f1.write('\n')
                dialog_coming = True
                continue
            print("stuck here 7")

            # -> Detection of Actionline , etc. (blank lines land here too and
            # become the paragraph separators used below)
            print("line 120 Actionline or Something else", line)
            f1.write(line)
            f1.write('\n')
    print("stuck here 8")
    print("line 125 file closed")

    # -> copying all the data in file1.txt to file.txt with bytes included
    text = textract.process(staged_path, encoding="utf8", errors='ignore')
    print("line 130: ", text)
    with open(raw_path, 'wb') as raw_out:
        raw_out.write(text)

    # Read back with the encoding the bytes were produced in rather than the
    # platform default.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as staged_in:
        paragraphs = staged_in.read().split('\n\n')

    # -> Creating Refined List of Scenes and its data
    refined = [paragraph.strip() for paragraph in paragraphs if paragraph.strip()]

    # -> Paragraphs that come before the first slug line are counted and rejected
    for index, paragraph in enumerate(refined):
        if looks_like_slug(paragraph):
            refined = refined[index:]
            break
        total_scenes = total_scenes + 1

    print("line 156:Refined", refined)
    return refined, total_scenes


# -> For Getting Slug lines and Non-Slug lines from Refined Data
def getSlugAndNonSlug(refined):
    """Partition refined paragraphs into slug lines and all other paragraphs.

    A paragraph counts as a slug line when, after stripping, it starts with
    one of the INT / EXT / I/E style prefixes or matches ``slug_pattern`` or
    ``pat`` at its start.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        tuple: ``(sluglines, without_slug)``, two lists of stripped paragraphs
        in their original relative order.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for raw in refined:
        text = raw.strip()
        # Cheap prefix test first; the regexes are only consulted when it fails.
        is_heading = (text.startswith(heading_prefixes)
                      or re.match(slug_pattern, text)
                      or re.match(pat, text))
        target = sluglines if is_heading else without_slug
        target.append(text)
    return sluglines, without_slug


# -> For Extracting the Speakers from Refined Data
def getSpeakers(without_slug):
    """Collect the distinct character names that appear as speaker cues.

    Each paragraph is split into rows. A row is taken as a cue when it is
    upper-case, the row looked at ahead of it is not blank, and it starts
    like a name (capital letters / apostrophes, optionally a hyphenated part,
    a number such as "#1", and a bracketed extension). The name is the text
    before the first "(".

    NOTE(review): the look-ahead row is the one two positions after the
    current row, clamped to the paragraph's last row -- not the immediately
    following row. Kept as is; confirm whether that is intended.

    Args:
        without_slug: Iterable of non-slug paragraph strings.

    Returns:
        list[str]: Unique, non-empty names with none of their space-separated
        words in ``reserved_words``. Order is not defined (it goes through a set).
    """
    cue_pattern = r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
    found = []
    for block in without_slug:
        rows = block.split('\n')
        last = len(rows) - 1
        for pos, row in enumerate(rows):
            if not row.isupper():
                continue
            ahead = rows[min(pos + 2, last)]
            if ahead.strip() == "":
                continue
            if re.match(cue_pattern, row):
                found.append(row.split("(")[0].strip())

    unique = list(set(found))
    named = [name for name in unique if len(name) > 0]
    return [name for name in named
            if set(name.split(" ")).intersection(reserved_words) == set()]


# ->Use this function For getting the Scenes with all appropriate data extracted
def getScenes(refined, total_scenes, characters):
    """Group refined paragraphs into scenes and pull out the dialogue data.

    A paragraph that looks like a slug line starts a new scene. Any other
    paragraph is a dialogue when its first row (up to the first "(") is a
    known character, a transition when it contains one of ``transitions``
    (case-insensitive), and an action line otherwise.

    A dialogue is stored as ``{speaker: [parenthetical, scene_no, text,
    len(text), prev, next]}``; ``parenthetical`` is ``"NONE"`` when the row
    after the cue is not bracketed. A cue with neither text nor parenthetical
    is ignored.

    Args:
        refined: Paragraph strings; rows inside a paragraph are separated by
            a newline.
        total_scenes: Not used by this function; kept for callers.
        characters: Collection of known speaker names.

    Returns:
        tuple: ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``.
        ``scenes`` holds one list per slug line, containing its rows as
        strings, ``{'Transition': text}`` dicts and dialogue dicts in order.
        ``actionline`` is every action paragraph, ``parenthetical_lis`` the
        parenthetical of every dialogue, ``speakers`` the distinct speakers
        (unordered), ``dialogues`` every dialogue dict. Paragraphs before the
        first slug line are left out of ``scenes`` but still feed the other
        lists.

    NOTE(review): ``prev`` / ``next`` are read from ``main_lis``, which is not
    defined in this file, so they come out as ``""`` unless that name is
    provided elsewhere.
    NOTE(review): paragraphs starting with "INT" (e.g. "INTERCUT WITH:",
    "INTERVAL") satisfy the slug-line test before the transition test is
    reached -- confirm that is wanted.
    """
    slug_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I')

    def neighbour(scene_index, position):
        # Best-effort lookup of a neighbouring speaker; "" when unavailable.
        try:
            return main_lis[scene_index][position]
        except (NameError, LookupError, TypeError):
            return ""

    i = 0  # position of the next dialogue inside the current scene
    scene = []
    dialogues = []
    speakers = []
    scene_no = 0
    actionline = []
    parenthetical_lis = []
    scenes = []
    parenthetical = 'NONE'
    transition_re = None  # compiled once, on the first non-dialogue paragraph

    for line in refined:
        stripped = line.strip()
        # -> For Detection of Slug lines
        if (stripped.startswith(slug_prefixes)
                or re.match(slug_pattern, stripped)
                or re.match(pat, stripped)):
            print("Slug-line Case")
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        # -> For Detection of Actionlines, Speakers, Dialogues, Transitions
        print("Not Slug-line Case")
        lis = [row.strip() for row in line.split("\n")]
        print(" \n Line 232 probable dialogue list", lis)
        speaker = lis[0].split('(')[0].strip()

        if speaker in characters:
            mydic = {}
            print("Speaker 238", speaker)
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                # Cue, parenthetical, then the spoken rows.
                parenthetical = lis[1]
                dia = ' '.join(lis[2:]).replace("\"", '')
            else:
                # Cue followed directly by the spoken rows. Rows are joined
                # with a space, as in the branch above, so words at row
                # boundaries are not fused together.
                dia = ' '.join(row for row in lis[1:] if row).replace("\"", '')
                print(" length dia\n", len(dia))

            # -> Skip a cue that has neither dialogue nor a parenthetical
            if not (len(dia) == 0 and parenthetical == "NONE"):
                print(" len dia != and Parenthetical == NONE: 384 ")
                # prev is previous speaker and next is next speaker of the dialogue
                prev_speaker = neighbour(scene_no - 1, i - 1) if i - 1 >= 0 else ""
                next_speaker = neighbour(scene_no - 1, i + 1)
                mydic[speaker] = [parenthetical, scene_no, dia, len(dia),
                                  prev_speaker, next_speaker]
                print("line 270", mydic)
                i = i + 1
                speakers.append(speaker)
                parenthetical_lis.append(parenthetical)
                dialogues.append(mydic)
                scene.append(mydic)
            parenthetical = "NONE"
        else:
            # Collapse every run of whitespace (newlines included) to one space.
            flat = ' '.join(line.split())
            if transition_re is None:
                transition_re = re.compile(
                    r'.*(' + '|'.join(re.escape(t) for t in transitions) + r').*',
                    re.IGNORECASE)
            if transition_re.match(flat):
                scene.append({'Transition': flat})
            else:
                actionline.append(flat)
                scene.append(flat)

    # -> Appending the leftover last scene, which the loop above never flushes
    scenes.append(scene)

    speakers = list(set(speakers))
    s = []
    # -> The first entry holds whatever preceded the first slug line; skip it.
    # String elements are split into rows, dict elements are kept whole.
    for scene in scenes[1:]:
        rows = []
        for ele in scene:
            if isinstance(ele, str):
                rows.extend(ele.split("\n"))
            else:
                rows.append(ele)
        s.append(rows)
    print("dialogue: ", dialogues)
    return s, actionline, parenthetical_lis, speakers, dialogues