"""Screenplay refinement helpers.

The functions in this module turn the text extracted from a script document
into blank-line separated paragraphs (``getRefined``), split those paragraphs
into slugline / non-slugline groups (``getSlugAndNonSlug``), collect speaker
names (``getSpeakers``) and finally build a per-scene structure
(``getScenes``).
"""
import logging
import os
import re

import docx
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL

from MNF.settings import BasePath

# NOTE: the module-level ``import textract`` is commented out in this file, so
# ``getRefined`` imports it at function scope; importing this module therefore
# never requires textract, only calling ``getRefined`` does.

_logger = logging.getLogger(__name__)

doc = docx.Document()
basePath = BasePath()

# -> Patterns for detection of sluglines, transitions, action lines and dialogues
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:', 'JUMP TO:', 'CUT BACK TO:',
               'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE', 'PBS', 'FADE', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']

# Prefixes that look like "INT..." but must not open a scene while refining.
_NOT_SLUG_PREFIXES = ('INTERCUT', 'INTERMISSION', 'INTERVAL')

# Speaker cue, e.g. "JOHN", "MARY-ANN", "GUARD #2", "JOHN (V.O.)".
# This accepts exactly the same strings as the historical pattern
#   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but every repeated group now has to end in a separator, which removes the
# nested-quantifier ambiguity that made ``fullmatch`` backtrack exponentially
# on long upper-case lines that do not match (e.g. ones ending in '.').
_SPEAKER_CUE = (
    r"(?:[A-Z'’]+(?:\s+-*|-+))*(?:[A-Z'’]+)?"
    r"(?:[#]*[\s]*[1-9])*"
    r"(?:\(.*\))?"
)
# Speaker cue that starts with an honorific, e.g. "DR. WHO".
_TITLED_SPEAKER_CUE = r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+"
# Prefix test used by ``getSpeakers`` (applied with ``re.match``).
_SPEAKER_PREFIX = r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
_PARENTHETICAL = r"\(.*\)"


def _is_slugline(text):
    """Return True when *text* (stripped first) starts like a slugline.

    Every literal prefix the code used to test with ``startswith`` ('INT.',
    'EXT ', 'INT/EXT', 'I/E', ...) is already matched by ``slug_pattern`` or
    ``pat``, so the two regular expressions are the complete test.
    """
    text = text.strip()
    return bool(re.match(slug_pattern, text) or re.match(pat, text))


def _looks_like_speaker(line):
    """Return True when an upper-case *line* has the shape of a speaker cue."""
    # ``isupper`` is cheap and runs first so the regexes only see candidates.
    if not line.isupper():
        return False
    return bool(re.fullmatch(_SPEAKER_CUE, line) or re.fullmatch(_TITLED_SPEAKER_CUE, line))


def _refine_lines(raw_text):
    """Select the lines of *raw_text* that belong to the refined script.

    Everything before the first slugline is discarded.  After a speaker cue,
    blank lines are skipped until the dialogue line arrives, so that cue,
    parentheticals and dialogue end up in one paragraph.  All other lines,
    including blank ones (they act as paragraph separators), are kept.

    Returns the kept lines, stripped, in order.
    """
    kept = []
    in_scene = False
    dialog_coming = False
    for raw_line in raw_text.split("\n"):
        line = raw_line.strip()

        # -> Slugline: opens a scene and ends any pending dialogue state.
        if _is_slugline(line) and not line.startswith(_NOT_SLUG_PREFIXES):
            in_scene = True
            # A cue left hanging at the end of the previous scene must not
            # turn the first line of this scene into "dialogue".
            dialog_coming = False
            kept.append(line)
            continue

        if not in_scene:
            continue

        # -> Probable dialogue / parenthetical following a speaker cue.
        if dialog_coming:
            if not line:
                continue
            kept.append(line)
            # A parenthetical keeps us waiting for the actual dialogue line.
            if not re.match(_PARENTHETICAL, line):
                dialog_coming = False
            continue

        # -> Speaker cue: the next non-blank line is dialogue.
        if _looks_like_speaker(line):
            _logger.debug("possible speaker: %s", line)
            kept.append(line)
            dialog_coming = True
            continue

        # -> Action line or anything else (blank lines included).
        kept.append(line)
    return kept


# -> Random function - no use
def breaksen(s):
    """Split *s* into chunks of at most 32 words when it exceeds 256 words.

    Returns ``[s]`` unchanged for inputs of up to 256 words; otherwise a list
    of space-joined 32-word chunks (no empty trailing chunk).
    """
    words = s.split()
    if len(words) <= 256:
        return [s]
    return [" ".join(words[start:start + 32]) for start in range(0, len(words), 32)]


# -> Function for getting the refined paragraphs of a script (docx)
def getRefined(filename1):
    """Extract, refine and paragraph-split the script stored at *filename1*.

    Side effects: writes ``file.txt`` and ``file1.txt`` under
    ``{basePath}/conversion/translation``.

    Returns ``(refined, total_scenes)`` where ``refined`` is the list of
    non-empty paragraphs starting at the first slugline paragraph, and
    ``total_scenes`` is the number of paragraphs that preceded that first
    slugline (the whole paragraph count when no slugline exists, in which case
    the paragraphs are returned untrimmed).

    Raises ImportError when textract is not installed.
    """
    import textract  # see the NOTE next to the module imports

    _logger.debug("getRefined called for %s", filename1)
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    refined_path = rf"{basePath}/conversion/translation/file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as handle:
        handle.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as handle:
        raw_text = handle.read()

    with open(refined_path, 'w', encoding="utf8", errors='ignore') as handle:
        handle.writelines(line + '\n' for line in _refine_lines(raw_text))

    # -> copying all the data in file1.txt to file.txt with bytes included
    text = textract.process(refined_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as handle:
        handle.write(text)
    # Decode explicitly: the data is UTF-8 regardless of the platform default.
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as handle:
        paragraphs = handle.read().split('\n\n')

    # -> Creating the refined list of scenes and their data
    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]

    # -> Paragraphs in front of the first slugline are rejected.
    first_slug = next((idx for idx, para in enumerate(refined) if _is_slugline(para)), None)
    if first_slug is None:
        total_scenes = len(refined)
    else:
        total_scenes = first_slug
        refined = refined[first_slug:]

    _logger.debug("refined paragraphs: %s", refined)
    return refined, total_scenes


# -> For getting slug lines and non-slug lines from refined data
def getSlugAndNonSlug(refined):
    """Partition *refined* paragraphs into ``(sluglines, without_slug)``.

    Each paragraph is stripped; order within each list is preserved.
    """
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if _is_slugline(para):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug


# -> For extracting the speakers from refined data
def getSpeakers(without_slug):
    """Collect speaker names from the non-slugline paragraphs.

    A line is a speaker cue when it is upper-case, starts like a name and is
    directly followed by a non-blank line in the same paragraph.  The name is
    the text in front of the first '('.  Names containing any of
    ``reserved_words`` are dropped.

    The last line of a paragraph is never a cue because nothing follows it;
    the previous indexing compared that line against itself, which turned
    every closing upper-case line (e.g. a standalone "DISSOLVE TO:") into a
    "character".

    Returns the unique names in first-seen order.
    """
    found = []
    for para in without_slug:
        lines = para.split('\n')
        for item, following in zip(lines, lines[1:]):
            if not item.isupper() or following.strip() == "":
                continue
            if re.match(_SPEAKER_PREFIX, item):
                found.append(item.split("(")[0].strip())

    unique = list(dict.fromkeys(name for name in found if len(name) > 0))
    reserved = set(reserved_words)
    return [name for name in unique if reserved.isdisjoint(name.split(" "))]


def _neighbour_speaker(scene_no, index):
    """Look up a neighbouring speaker in the optional module global ``main_lis``.

    ``main_lis`` is not defined in the visible part of this module; the
    original code referenced it inside a bare ``except`` and therefore fell
    back to "" whenever the lookup failed.  That fallback is kept, but only
    for the failures a missing or short table can produce.
    """
    table = globals().get("main_lis")
    if table is None:
        return ""
    try:
        return table[scene_no - 1][index]
    except (IndexError, KeyError, TypeError):
        return ""


# -> Use this function for getting the scenes with all appropriate data extracted
def getScenes(refined, total_scenes, characters):
    """Build the per-scene structure from refined paragraphs.

    Parameters:
        refined: paragraphs as returned by ``getRefined``.
        total_scenes: accepted for interface compatibility; not used here.
        characters: known speaker names (see ``getSpeakers``).

    Returns ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:
        scenes: one list per scene; its elements are slugline/action strings,
            ``{'Transition': text}`` dicts and dialogue dicts
            ``{speaker: [parenthetical, scene_no, text, len(text), prev, next]}``.
        actionline: every action paragraph, whitespace-normalised.
        parenthetical_lis: the parenthetical of each dialogue ("NONE" if absent).
        speakers: unique speakers that have a dialogue, first-seen order.
        dialogues: all dialogue dicts in order.

    Content before the first slugline paragraph is not part of any scene.
    """
    transition_re = re.compile('|'.join(re.escape(t) for t in transitions), re.IGNORECASE)
    known_characters = set(characters)

    scenes = []
    scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    dialogue_index = 0  # position of the next dialogue inside the current scene

    for line in refined:
        # -> Slugline: close the running scene and start a new one.
        if _is_slugline(line):
            scenes.append(scene)
            scene = [line]
            dialogue_index = 0
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        speaker = lis[0].split('(')[0].strip()

        # -> Speaker paragraph: optional parenthetical followed by dialogue.
        if speaker in known_characters:
            if len(lis) > 1 and re.match(_PARENTHETICAL, lis[1]):
                parenthetical = lis[1]
                spoken = lis[2:]
            else:
                parenthetical = "NONE"
                spoken = lis[1:]
            # Lines are joined with a space in both cases so that words at
            # line breaks are not glued together.
            dia = ' '.join(spoken).replace("\"", '')

            # A cue with neither dialogue nor parenthetical carries nothing.
            if len(dia) == 0 and parenthetical == "NONE":
                continue

            prev_speaker = _neighbour_speaker(scene_no, dialogue_index - 1) if dialogue_index >= 1 else ""
            next_speaker = _neighbour_speaker(scene_no, dialogue_index + 1)
            # prev/next are the previous and next speaker of the dialogue
            mydic = {speaker: [parenthetical, scene_no, dia, len(dia), prev_speaker, next_speaker]}
            _logger.debug("dialogue: %s", mydic)
            dialogue_index += 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)
            continue

        # -> Transition or action line: collapse the paragraph to one line.
        flat = ' '.join(line.replace("\n", " ").split())
        if transition_re.search(flat):
            scene.append({'Transition': flat})
        else:
            actionline.append(flat)
            scene.append(flat)

    # -> Append the last scene, which the loop above never closes.
    scenes.append(scene)
    # The first entry holds whatever preceded the first slugline.
    scenes = scenes[1:]
    speakers = list(dict.fromkeys(speakers))

    # -> Split multi-line strings (slugline paragraphs) into separate lines.
    flattened = []
    for scene in scenes:
        entries = []
        for element in scene:
            if isinstance(element, str):
                entries.extend(element.split("\n"))
            else:
                entries.append(element)
        flattened.append(entries)

    _logger.debug("dialogues: %s", dialogues)
    return flattened, actionline, parenthetical_lis, speakers, dialogues