"""Break a screenplay down into scenes, speakers and dialogue.

The module extracts text from a script file, filters it into paragraphs and
then classifies each paragraph as a slugline (scene heading), a dialogue
block (speaker line followed by an optional parenthetical and the spoken
text), a transition or an action line.
"""
import logging
import os
import re

# NOTE: ``textract`` (third-party) is imported lazily inside
# ``dialogueBreakdown.getRefined`` so that the pure parsing helpers in this
# module stay importable without it.
# from MNF.settings import BasePath
# basePath = BasePath()
basePath = "/home/user/mnf/project/MNF"

logger = logging.getLogger(__name__)

# Optional scene number, optional dot, optional blanks, then INT/EXT.
# NOTE(review): the character classes also accept "ENT" and "IXT", so an
# upper-case line such as "ENTER ..." is treated as a slugline - confirm
# whether that is intended before tightening.
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Optional scene number followed by I/E or E/I.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
reserved_words = ['MONTAGE', 'PBS', 'FADE TO BLACK:', 'Beat.', 'VOX POP',
                  'CUT TO', 'CUT TO:', 'CUT TO BLACK', 'FADE', 'TITLE',
                  'SPLIT', 'SCREEN', 'SHOTS', 'INTERVAL', 'END CREDITS',
                  'INTERVAL']
transitions = ['CUT TO:', 'CUT TO', 'CUT TO BLACK', 'FADE TO BLACK:',
               'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:',
               'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:',
               'BACK TO:', 'END CREDITS', 'INTERVAL']

_SLUG_RE = re.compile(slug_pattern)
_INT_EXT_RE = re.compile(pat)
# Words that start like INT/EXT headings but are not scene headings.
_NON_SLUG_PREFIXES = ('INTERCUT', 'INTERVAL', 'INTERMISSION')
_PARENTHETICAL_RE = re.compile(r"\(.*\)")
# Accepts the same strings as the former
# ``([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*`` but requires a non-empty
# separator between letter runs, so a failed full match can no longer
# backtrack exponentially on long upper-case lines.
_SPEAKER_LINE_RE = re.compile(
    r"(?:[A-Z'’]+(?:\s+-*|-+))*[A-Z'’]*(?:#*\s*[1-9])*(?:\(.*\))*")
_SPEAKER_PREFIX_RE = re.compile(
    r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
_RESERVED = frozenset(reserved_words)
_TRANSITIONS = frozenset(transitions)


def _is_slugline(text):
    """Return True when *text* (a line or paragraph) starts a new scene.

    A plain ``startswith`` test for INT/EXT/I/E/E/I is not needed: every such
    prefix is already matched by ``slug_pattern`` or ``pat``.
    """
    stripped = text.strip()
    if stripped.startswith(_NON_SLUG_PREFIXES):
        return False
    return bool(_SLUG_RE.match(stripped) or _INT_EXT_RE.match(stripped))


def _speaker_name(paragraph, characters):
    """Return the speaker heading *paragraph* if it is a known character, else None."""
    first_line = paragraph.split("\n")[0].strip()
    name = first_line.split('(')[0].strip()
    return name if name in characters else None


def _is_reserved(name):
    """Return True when *name* is a reserved word/transition or contains a reserved word."""
    return (name in _RESERVED or name in _TRANSITIONS
            or not _RESERVED.isdisjoint(name.split(" ")))


def _refine_lines(lines):
    """Return the lines of a raw script dump that should be kept.

    Everything before the first slugline is dropped. After a speaker line,
    blank lines are skipped so that the speaker, its parentheticals and the
    first dialogue line end up in one paragraph; every other line (blank
    lines included) is kept unchanged.
    """
    kept = []
    in_script = False
    dialog_coming = False
    for line in lines:
        stripped = line.strip()
        if _is_slugline(stripped):
            in_script = True
            kept.append(line)
            continue
        if not in_script:
            continue
        if dialog_coming:
            if stripped == "":
                continue
            logger.debug("probable dialogue or parenthetical: %s", line)
            kept.append(line)
            # A parenthetical keeps the dialogue pending; anything else is
            # the dialogue itself and ends it.
            if not _PARENTHETICAL_RE.match(stripped):
                dialog_coming = False
            continue
        if line.isupper() and _SPEAKER_LINE_RE.fullmatch(stripped):
            logger.debug("probable dialogue speaker: %s", line)
            dialog_coming = True
        kept.append(line)
    return kept


class dialogueBreakdown:
    """Parse a screenplay into sluglines, speakers, dialogues and scenes."""

    def getRefined(self, filename1):
        """Extract the script text from *filename1* and split it into paragraphs.

        Two files are written under ``{basePath}/conversion/subtitling/files``:
        ``<name>.txt`` (text returned by textract) and ``<name>1.txt`` (the
        filtered lines).

        Returns:
            ``(refined, total_scenes)`` - the non-empty, stripped paragraphs
            starting at the first slugline, and the number of sluglines among
            them. Previously ``total_scenes`` held the number of paragraphs
            *preceding* the first slugline, which does not match how
            ``getScenes`` uses the value.
        """
        # Local import: the module-level import is disabled in this file.
        import textract

        stem = os.path.splitext(os.path.basename(filename1))[0]
        raw_path = rf"{basePath}/conversion/subtitling/files/{stem}.txt"
        cleaned_path = rf"{basePath}/conversion/subtitling/files/{stem}1.txt"

        with open(raw_path, 'wb') as raw_file:
            raw_file.write(textract.process(filename1))
        with open(raw_path, encoding="utf-8") as raw_file:
            raw_text = raw_file.read()

        # NOTE(review): written with the platform default encoding, as before;
        # presumably textract reads plain text the same way - confirm before
        # forcing UTF-8 here.
        with open(cleaned_path, 'w') as cleaned_file:
            cleaned_file.writelines(
                f"{line}\n" for line in _refine_lines(raw_text.split("\n")))

        # The cleaned text is passed through textract again and stored back
        # in <name>.txt, which is the file the paragraphs are read from.
        with open(raw_path, 'wb') as raw_file:
            raw_file.write(textract.process(cleaned_path))
        with open(raw_path, "r", encoding="utf-8") as raw_file:
            paragraphs = raw_file.read().split('\n\n')

        refined = [paragraph.strip() for paragraph in paragraphs]
        refined = [paragraph for paragraph in refined if paragraph != ""]
        # Drop whatever precedes the first slugline (left untouched when the
        # text contains no slugline at all).
        for index, paragraph in enumerate(refined):
            if _is_slugline(paragraph):
                refined = refined[index:]
                break
        total_scenes = sum(1 for paragraph in refined if _is_slugline(paragraph))
        return refined, total_scenes

    def getSlugAndNonSlug(self, refined):
        """Split *refined* into ``(sluglines, without_slug)``, each paragraph stripped."""
        sluglines = []
        without_slug = []
        for para in refined:
            para = para.strip()
            if _is_slugline(para):
                sluglines.append(para)
            else:
                without_slug.append(para)
        return sluglines, without_slug

    def getSpeakers(self, without_slug):
        """Return the sorted character names found in the non-slug paragraphs.

        A line is a speaker line when it is upper-case, starts like a name and
        has non-blank content after it in the same paragraph. A lone
        upper-case line, or the last line of a paragraph, therefore no longer
        counts as a speaker. Reserved words and transitions are excluded,
        including multi-word ones such as "CUT TO:".
        """
        found = set()
        for para in without_slug:
            lines = para.split('\n')
            last_content = max(
                (idx for idx, text in enumerate(lines) if text.strip()),
                default=-1)
            for idx, item in enumerate(lines):
                if idx >= last_content:
                    # Nothing but blanks (or nothing at all) follows.
                    break
                if item.isupper() and _SPEAKER_PREFIX_RE.match(item):
                    name = item.split("(")[0].strip()
                    if name:
                        found.add(name)
        return sorted(name for name in found if not _is_reserved(name))

    def getListForPrevAndNextDialogue(self, refined, characters):
        """Group the speakers of *refined* by scene, in order of appearance.

        Returns:
            ``(main_lis, speaker_having_dia)`` where ``main_lis[k]`` lists the
            speakers between slugline ``k+1`` and slugline ``k+2``, and
            ``speaker_having_dia`` lists the speakers after the last slugline
            (the final scene is returned separately, not inside ``main_lis``).
        """
        speaker_having_dia = []
        main_lis = []
        for line in refined:
            if _is_slugline(line):
                main_lis.append(speaker_having_dia)
                speaker_having_dia = []
                continue
            speaker = _speaker_name(line, characters)
            if speaker is not None:
                speaker_having_dia.append(speaker)
        # The first entry holds speakers that precede the first slugline.
        main_lis = main_lis[1:]
        return main_lis, speaker_having_dia

    def getScenes(self, refined, total_scenes, characters):
        """Build the per-scene structure of the script.

        Args:
            refined: paragraphs of the script (see ``getRefined``).
            total_scenes: expected number of scenes; when it is smaller than
                the number of sluglines found in *refined*, the latter is used
                so that every scene has an entry in the per-scene dicts.
            characters: known speaker names (see ``getSpeakers``).

        Returns:
            A 10-tuple ``(scenes, parenthetical_count_dict,
            predecessor_scene_no_dict, successor_scene_no_dict, actionline,
            parenthetical_lis, speakers, dia_count, dialogues, slugline_dic)``.
            Each scene is a list holding the slugline paragraph, action-line
            strings, ``{'Transition': text}`` dicts and dialogue dicts of the
            form ``{speaker: [parenthetical, scene_no, dialogue,
            len(dialogue), previous_speaker, next_speaker, speaker_line]}``.
        """
        character_set = set(characters)

        # First pass: the speakers of each scene in order, used to look up
        # the previous/next speaker of every dialogue.
        speakers_by_scene = {}
        slug_count = 0
        for paragraph in refined:
            if _is_slugline(paragraph):
                slug_count += 1
                continue
            name = _speaker_name(paragraph, character_set)
            if name is not None:
                speakers_by_scene.setdefault(slug_count, []).append(name)

        scene_total = max(total_scenes, slug_count)
        scene_keys = ['Scene ' + str(n + 1) for n in range(scene_total)]
        predecessor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        dia_count = dict.fromkeys(scene_keys, 0)
        successor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        parenthetical_count_dict = dict.fromkeys(scene_keys, 0)

        scenes = []
        scene = []
        dialogues = []
        speakers = []
        actionline = []
        parenthetical_lis = []
        slugline_dic = {}
        scene_no = 0
        pc = 0            # parentheticals seen in the current scene
        speaker_idx = 0   # position of the next speaker within the scene

        for line in refined:
            if _is_slugline(line):
                successor_scene_no = min(scene_no + 3, scene_total)
                predecessor_scene_no = max(scene_no + 1, 0)
                if scene_no > 0:
                    # Store the finished scene's count under its own number.
                    parenthetical_count_dict['Scene ' + str(scene_no)] = pc
                pc = 0
                scenes.append(scene)
                scene = []
                speaker_idx = 0
                scene_no += 1
                # NOTE: these entries describe the *following* scene
                # (scene_no + 1), so the dicts also receive one key past the
                # last scene; kept as-is because consumers of these dicts are
                # outside this file.
                predecessor_scene_no_dict['Scene ' + str(scene_no + 1)] = predecessor_scene_no
                successor_scene_no_dict['Scene ' + str(scene_no + 1)] = successor_scene_no
                successor_scene_no_dict['Scene ' + str(1)] = 2
                scene.append(line)
                slugline_dic[scene_no] = line.split("\n")[0].strip('0123456789.- ')
                continue

            lis = [part.strip() for part in line.split("\n")]
            word = lis[0]
            name = word.split('(')[0].strip()

            if name not in character_set:
                text = ' '.join(line.split())
                if text in transitions:
                    scene.append({'Transition': text})
                    continue
                actionline.append(text)
                scene.append(text)
                continue

            speaker = name
            extendedSpeaker = word.strip()
            position = speaker_idx
            speaker_idx += 1
            logger.debug("speaker: %s", speaker)

            parenthetical = "NONE"
            if len(lis) > 1 and _PARENTHETICAL_RE.match(lis[1]):
                pc += 1
                parenthetical = lis[1]
                dia = ' '.join(lis[2:])
            else:
                # Join with spaces so words on consecutive lines do not run
                # together; double quotes are removed in this branch only.
                dia = ' '.join(part for part in lis[1:] if part)
                dia = dia.replace("\"", '')

            if len(dia) == 0 and parenthetical == "NONE":
                # Speaker line without any content: nothing to record.
                continue

            cast = speakers_by_scene.get(scene_no, [])
            prev_speaker = cast[position - 1] if position > 0 else ""
            next_speaker = cast[position + 1] if position + 1 < len(cast) else ""

            mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                               prev_speaker, next_speaker, extendedSpeaker]}
            logger.debug("dialogue: %s %s", speaker, mydic[speaker])
            scene_key = 'Scene ' + str(scene_no)
            dia_count[scene_key] = dia_count.get(scene_key, 0) + 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)

        scenes.append(scene)
        parenthetical_count_dict['Scene ' + str(scene_no)] = pc
        speakers = sorted(set(speakers))
        # The first collected "scene" is whatever preceded the first slugline.
        scenes = scenes[1:]
        logger.debug("Scenes: %s", scenes)
        return scenes, parenthetical_count_dict, predecessor_scene_no_dict, successor_scene_no_dict, actionline, parenthetical_lis, speakers, dia_count, dialogues, slugline_dic