"""Screenplay parsing helpers: text clean-up, sluglines, speakers and scenes."""

# import textract
import os
import re

import docx
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt

# Project settings; BasePath() is interpolated into the scratch-file paths below.
from MNF.settings import BasePath

doc = docx.Document()

basePath = BasePath()
#basePath = '/home/user/mnf/project/MNF'

# Optional scene number / dot / whitespace, then INT or EXT (also IXT / ENT).
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Optional scene number / whitespace, then an I/E style marker.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']

# Default document style: 12pt Courier New.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

# Compiled once; these are matched against every line / paragraph of a script.
_SLUG_RE = re.compile(slug_pattern)
_IE_RE = re.compile(pat)
_CUE_LINE_RE = re.compile(r"([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*")
_SPEAKER_RE = re.compile(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
_PARENTHETICAL_RE = re.compile(r"\(.*\)")


def _is_slugline(text, match_ie_pattern=True):
    """Return True when *text* (after stripping) opens like a scene heading.

    ``slug_pattern`` already matches every string starting with ``INT`` or
    ``EXT`` (including ``INT.``, ``INT/EXT`` ...), so only the ``I/E`` and
    ``E/I`` prefixes need a separate check.  ``match_ie_pattern`` additionally
    applies ``pat``; the first clean-up pass in ``getRefined`` does not use it.
    """
    candidate = text.strip()
    if _SLUG_RE.match(candidate) or candidate.startswith(('I/E', 'E/I')):
        return True
    return bool(match_ie_pattern and _IE_RE.match(candidate))


def breaksen(s):
    """Split a long string into chunks of at most 32 words.

    Strings of 256 words or fewer are returned untouched as a single-element
    list.  Longer strings are re-joined with single spaces in 32-word chunks.
    No empty trailing chunk is produced when the word count is an exact
    multiple of 32.
    """
    words = s.split()
    if len(words) <= 256:
        return [s]
    return [" ".join(words[start:start + 32]) for start in range(0, len(words), 32)]


def _regroup_lines(raw_text):
    """Return the lines of *raw_text* that ``getRefined`` writes to file1.txt.

    Sluglines are kept verbatim.  After a character cue, blank lines are
    dropped until the first non-parenthetical line so that the cue and the
    start of its dialogue end up in the same blank-line-delimited paragraph.
    """
    kept = []
    seen_slugline = False
    dialog_coming = False
    for raw_line in raw_text.split("\n"):
        if _is_slugline(raw_line, match_ie_pattern=False):
            seen_slugline = True
            kept.append(raw_line)
            continue

        line = raw_line.strip()
        # NOTE(review): nesting was reconstructed from a whitespace-collapsed
        # source; this assumes everything before the first slugline is dropped.
        if not seen_slugline:
            continue

        if dialog_coming:
            if line == "":
                continue
            kept.append(line)
            # A parenthetical keeps us waiting for the spoken line itself.
            if not _PARENTHETICAL_RE.match(line):
                dialog_coming = False
            continue

        if line.isupper() and _CUE_LINE_RE.fullmatch(line):
            kept.append(line)
            dialog_coming = True
            continue

        # Blank lines are kept on purpose: they delimit paragraphs later on.
        kept.append(line)
    return kept


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slugline.

    The text is extracted with ``textract``, regrouped so that each character
    cue sits with its dialogue, and split on blank lines.  Two scratch files,
    ``file.txt`` and ``file1.txt``, are (over)written under
    ``{basePath}/conversion/translation``.

    Args:
        filename1: path of the script file handed to ``textract.process``.

    Returns:
        ``(refined, total_scenes)`` where ``refined`` is the list of non-empty
        paragraphs starting at the first slugline, and ``total_scenes`` is the
        number of paragraphs that preceded that slugline (despite its name).
        If no slugline is found, every paragraph is returned and counted.
    """
    # The module-level import appears to be commented out, so import here to
    # keep this function usable without making the module import fail.
    import textract

    print("get_refined_called")
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    extracted = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(extracted)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        raw_text = raw_file.read()

    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as cleaned_file:
        cleaned_file.writelines(f"{line}\n" for line in _regroup_lines(raw_text))

    # file.txt ends up holding the data of file1.txt; it has no other use as of
    # now (the two scratch files could be merged into one).
    extracted = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(extracted)
    # Read back with the same encoding as above instead of the platform default.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]

    print("processing the script")
    total_scenes = 0
    for index, para in enumerate(refined):
        if _is_slugline(para):
            refined = refined[index:]
            break
        total_scenes += 1
    return refined, total_scenes


def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and everything else.

    Args:
        refined: iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``; both lists hold stripped paragraphs in
        their original order.
    """
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if _is_slugline(para):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug


def getSpeakers(without_slug):
    """Collect the distinct character names found in non-slugline paragraphs.

    A line is taken as a character cue when it is upper-case, matches the
    speaker pattern and the look-ahead line is not blank.  Anything from the
    first ``(`` on is cut off the name.  Names containing a word listed in
    ``reserved_words`` are discarded.

    Args:
        without_slug: iterable of paragraph strings.

    Returns:
        List of unique speaker names, in no particular order.
    """
    found = set()
    for para in without_slug:
        rows = para.split('\n')
        last = len(rows) - 1
        for index, row in enumerate(rows):
            # NOTE(review): the look-ahead is two rows past the cue (clamped to
            # the last row), exactly as in the original index arithmetic;
            # confirm that "next row" was not what was intended.
            lookahead = rows[min(index + 2, last)]
            if not row.isupper() or lookahead.strip() == "":
                continue
            if _SPEAKER_RE.match(row):
                found.add(row.split("(")[0].strip())
    return [
        name for name in found
        if name and set(name.split(" ")).isdisjoint(reserved_words)
    ]


def getScenes(refined,total_scenes,characters):
    """Group paragraphs into scenes made of sluglines, action, dialogue and transitions.

    Args:
        refined: paragraphs as returned by ``getRefined``.
        total_scenes: unused; kept for interface compatibility.
        characters: known speaker names (see ``getSpeakers``).

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:

        * ``scenes``: one list per slugline; items are strings (slugline and
          action lines, split on newlines) or dicts -- ``{'Transition': text}``
          or ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue),
          prev, next]}``.
        * ``actionline``: every action paragraph, whitespace-normalised.
        * ``parenthetical_lis``: the parenthetical of each dialogue
          (``"NONE"`` when absent).
        * ``speakers``: unique speakers that have at least one dialogue.
        * ``dialogues``: all dialogue dicts in script order.

        Paragraphs before the first slugline are not part of any scene.
    """
    known_speakers = set(characters)
    scenes = []
    current_scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    parenthetical = 'NONE'
    scene_no = 0

    for block in refined:
        if _is_slugline(block):
            scenes.append(current_scene)
            current_scene = [block]
            scene_no += 1
            continue

        rows = [row.strip() for row in block.split("\n")]
        speaker = rows[0].split('(')[0].strip()
        if speaker in known_speakers:
            if len(rows) > 1 and _PARENTHETICAL_RE.match(rows[1]):
                parenthetical = rows[1]
                spoken_rows = rows[2:]
            else:
                spoken_rows = rows[1:]
            # Rows are joined with a space in both cases; the original glued
            # them with '' when there was no parenthetical, merging words
            # across line breaks.
            dia = ' '.join(spoken_rows).replace("\"", '')

            # NOTE(review): nesting reconstructed from a collapsed source; a
            # cue with neither dialogue nor parenthetical is assumed to be
            # skipped entirely.
            if len(dia) == 0 and parenthetical == "NONE":
                continue

            # The last two slots are the previous / next speaker of the
            # dialogue. The original looked them up in `main_lis`, which is
            # not defined in this module, so they were always empty strings.
            entry = {speaker: [parenthetical, scene_no, dia, len(dia), "", ""]}
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(entry)
            current_scene.append(entry)
            parenthetical = "NONE"
            continue

        text = ' '.join(block.split())
        if text in transitions:
            current_scene.append({'Transition': text})
            continue
        actionline.append(text)
        current_scene.append(text)

    scenes.append(current_scene)
    speakers = list(set(speakers))

    # The first bucket holds whatever came before the first slugline.
    flattened = []
    for scene in scenes[1:]:
        items = []
        for element in scene:
            if isinstance(element, str):
                items.extend(element.split("\n"))
            else:
                items.append(element)
        flattened.append(items)
    return flattened,actionline,parenthetical_lis,speakers,dialogues