# Screenplay text parsing helpers: slugline, speaker and scene extraction.
# import textract
# NOTE(review): getRefined() in this file calls textract.process(), but the
# import above is commented out, so that call raises NameError unless
# `textract` is made available some other way - confirm and restore if intended.
import re
import docx
import os
# Module-level python-docx document; its 'Normal' style is configured below.
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
# Import the project base path helper.
from MNF.settings import BasePath
# Interpolated into the working-file paths used by getRefined().
basePath = BasePath()
#basePath = '/home/user/mnf/project/MNF'

# Slugline start: optional digits, optional '.', optional whitespace, then
# I or E, N or X, and T. Used with re.match, so it accepts any text that merely
# begins with INT/EXT (and also IXT/ENT), e.g. "INTERCUT WITH:".
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Abbreviated slugline start: optional digits and whitespace, then I or E,
# '/', I or E, and an optional '.' (e.g. "I/E", "E/I.").
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs equal to one of these are recorded as transitions by getScenes().
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# Upper-case words that disqualify a candidate speaker name in getSpeakers().
# 'FADE' is listed twice; the duplicate has no effect on the membership test.
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Default paragraph style of the module-level document: Courier New, 12 pt.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

def breaksen(s, max_words=256, chunk_words=32):
    """Break a long piece of text into smaller word chunks.

    Text with at most ``max_words`` whitespace-separated words is returned
    untouched as a single-element list. Longer text is cut into consecutive
    chunks of ``chunk_words`` words, each re-joined with single spaces; the
    last chunk may be shorter.

    Args:
        s: Text to split.
        max_words: Largest word count that is still returned as one piece.
        chunk_words: Number of words per chunk for longer text (must be > 0).

    Returns:
        A list of strings. The earlier implementation appended a trailing
        empty string whenever the word count was an exact multiple of the
        chunk size; that empty chunk is no longer produced.
    """
    # Split once instead of re-splitting the whole text for every chunk.
    words = s.split()
    if len(words) <= max_words:
        return [s]
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]


# Character-cue line (e.g. "JOHN", "JOHN (V.O.)", "MAN #1").
# Accepts exactly the same strings as the earlier pattern
#   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but every repetition of the first group must now end in a separator, so a
# run of letters can only be matched one way. With fullmatch, the earlier
# nested quantifier backtracked exponentially on upper-case lines that end in
# other punctuation (e.g. "THE DOOR SLAMS!").
_CHARACTER_CUE_RE = re.compile(
    r"(?:[A-Z'’]+(?:[\s]+[-]*|[-]+))*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
)


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slugline on.

    Steps, in order:
      1. ``textract.process`` extracts ``filename1``; the bytes are written to
         ``{basePath}/conversion/translation/file.txt`` and read back as text.
      2. The text is rewritten line by line into ``.../file1.txt``. Slugline
         lines are copied as-is. After the first slugline, every other line is
         written stripped, except that blank lines following a character cue
         are dropped until the first non-parenthetical line has been written.
         Lines before the first slugline are not written.
      3. ``file1.txt`` is run through ``textract.process`` again, written to
         ``file.txt``, read back and split into paragraphs on blank lines.

    NOTE(review): the indentation of the original source was not available
    when this block was reviewed; the nesting of step 2 (everything gated on
    having seen a slugline) is the most natural reading and should be
    confirmed against the original file.
    NOTE(review): ``textract`` is not imported in the file as reviewed (the
    import is commented out), so this function needs that name provided.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        ``(refined, total_scenes)``. ``refined`` is the list of non-empty,
        stripped paragraphs starting at the first slugline paragraph (or all
        paragraphs if none is a slugline). ``total_scenes`` is, despite its
        name, the number of paragraphs that preceded that first slugline.
    """
    print("get_refined_called")
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        raw_text = raw_file.read()

    first_pass_prefixes = (
        'INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT',
    )
    seen_slugline = False
    # True from a character cue until its first non-parenthetical line.
    dialog_coming = False
    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as cleaned:
        for raw_line in raw_text.split("\n"):
            line = raw_line.strip()
            if line.startswith(first_pass_prefixes) or re.match(slug_pattern, line):
                seen_slugline = True
                # Sluglines are copied unstripped; dialog_coming is left as is.
                cleaned.write(raw_line)
                cleaned.write('\n')
                continue
            if not seen_slugline:
                continue
            if dialog_coming:
                if line == "":
                    # Drop blank lines between a cue and its dialogue so they
                    # stay in one paragraph.
                    continue
                cleaned.write(line)
                cleaned.write('\n')
                if not re.match(r"\(.*\)", line):
                    # First dialogue line written; parentheticals keep waiting.
                    dialog_coming = False
                continue
            if line.isupper() and _CHARACTER_CUE_RE.fullmatch(line):
                cleaned.write(line)
                cleaned.write('\n')
                dialog_coming = True
                continue
            cleaned.write(line)
            cleaned.write('\n')

    # Second extraction pass over the cleaned text file, as before.
    text = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    # Read with the same explicit encoding the bytes were produced in, rather
    # than the platform default.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]
    print("processing the script")

    second_pass_prefixes = (
        'INT.', 'INT ', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I',
    )
    total_scenes = 0
    for index, para in enumerate(refined):
        if (para.startswith(second_pass_prefixes)
                or re.match(slug_pattern, para)
                or re.match(pat, para)):
            # First slugline found: discard everything before it.
            refined = refined[index:]
            break
        total_scenes += 1

    return refined, total_scenes


def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and all remaining paragraphs.

    Each paragraph is stripped first. It is treated as a slugline when it
    starts with one of the INT/EXT style prefixes below, or when ``re.match``
    succeeds against the module-level ``slug_pattern`` or ``pat``.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs, each
        in input order.
    """
    slug_prefixes = (
        'INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT',
    )
    sluglines = []
    without_slug = []
    for raw in refined:
        text = raw.strip()
        is_slug = bool(
            text.startswith(slug_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        bucket = sluglines if is_slug else without_slug
        bucket.append(text)
    return sluglines, without_slug


def getSpeakers(without_slug):
    """Collect the distinct speaker names found in non-slugline paragraphs.

    Every line of every paragraph is a candidate. A line yields a name when
    it is upper-case (``str.isupper``), its look-ahead line is not blank, and
    it begins with a character-cue match (capital letters or apostrophes,
    optionally followed by whitespace, hyphens, more letters, a ``#``/digit
    suffix and parenthesised text). The name is the text before the first
    ``(``, stripped. Empty names and names containing any word from the
    module-level ``reserved_words`` (split on single spaces) are discarded.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by "\\n").

    Returns:
        List of unique speaker names in first-seen order. The earlier
        implementation returned them in arbitrary set order, which could
        differ from run to run.
    """
    cue_re = re.compile(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
    found = {}  # insertion-ordered set of names
    for para in without_slug:
        lines = para.split('\n')
        last_index = len(lines) - 1
        for index, item in enumerate(lines):
            if not item.isupper():
                continue
            # NOTE(review): the original index arithmetic inspects the line
            # two positions ahead, clamped to the last line (so a one-line
            # paragraph inspects itself), not the immediately following line.
            # Kept exactly as it was; confirm whether "next line" was meant.
            lookahead = lines[min(index + 2, last_index)]
            if lookahead.strip() == "":
                continue
            if cue_re.match(item):
                name = item.split("(")[0].strip()
                if name:
                    found.setdefault(name, None)

    blocked = set(reserved_words)
    return [name for name in found if blocked.isdisjoint(name.split(" "))]


def getScenes(refined,total_scenes,characters):
    """Group script paragraphs into scenes and classify their contents.

    A paragraph that starts with one of the INT/EXT style prefixes, or that
    matches the module-level ``slug_pattern`` or ``pat`` at its start, opens a
    new scene. Every other paragraph is classified as:

    * dialogue: its first line, up to the first ``(``, is in ``characters``.
      A second line of the form ``(...)`` is the parenthetical; the remaining
      lines are joined with single spaces and double quotes are removed. The
      block is recorded unless it has neither dialogue text nor parenthetical.
    * transition: its whitespace-normalised text is in the module-level
      ``transitions``; stored as ``{'Transition': text}``.
    * action: anything else, stored as whitespace-normalised text.

    NOTE(review): because the prefix list contains bare ``'INT'`` and
    ``'EXT'``, paragraphs such as ``"INTERCUT WITH:"`` or ``"INTERVAL"`` open
    a new scene and never reach the transition check. Left unchanged here;
    confirm whether that is intended.

    Args:
        refined: Paragraph strings in script order.
        total_scenes: Unused; kept for call compatibility.
        characters: Container of known speaker names.

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:

        * ``scenes``: one list per scene, holding the slugline paragraph's
          lines followed by action strings, transition dicts and dialogue
          dicts. Content before the first slugline is not included.
        * ``actionline``: all action texts, in order.
        * ``parenthetical_lis``: one entry per recorded dialogue (``"NONE"``
          when it had no parenthetical).
        * ``speakers``: unique speakers of recorded dialogues, in first-seen
          order (previously arbitrary set order).
        * ``dialogues``: dialogue dicts of the form ``{speaker:
          [parenthetical, scene_no, text, len(text), prev, next]}``.

        The last four also include material found before the first slugline.
    """
    slug_prefixes = (
        'INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I',
    )
    scenes = []
    scene = []          # elements of the scene currently being built
    scene_no = 0
    dialogue_idx = 0    # dialogues recorded so far in the current scene
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []

    for para in refined:
        head = para.strip()
        if (head.startswith(slug_prefixes)
                or re.match(slug_pattern, head)
                or re.match(pat, head)):
            # Close the previous scene and start a new one with the slugline.
            scenes.append(scene)
            scene = [para]
            dialogue_idx = 0
            scene_no += 1
            continue

        lines = [part.strip() for part in para.split("\n")]
        speaker = lines[0].split('(')[0].strip()

        if speaker in characters:
            if len(lines) > 1 and re.match(r"\(.*\)", lines[1]):
                parenthetical = lines[1]
                spoken_lines = lines[2:]
            else:
                parenthetical = "NONE"
                spoken_lines = lines[1:]
            # Join with a space in both cases. The no-parenthetical case used
            # to join with '' and glued the last word of one line to the
            # first word of the next.
            dia = ' '.join(spoken_lines).replace("\"", '')

            if dia or parenthetical != "NONE":
                # NOTE(review): `main_lis` is not defined in this file as
                # reviewed, so both lookups raise NameError and the fields end
                # up as "". The former bare `except:` is narrowed so only
                # lookup failures are swallowed; confirm the intended source
                # of previous/next speakers.
                try:
                    prev = (main_lis[scene_no - 1][dialogue_idx - 1]
                            if dialogue_idx - 1 >= 0 else "")
                except (NameError, LookupError, TypeError):
                    prev = ""
                try:
                    following = main_lis[scene_no - 1][dialogue_idx + 1]
                except (NameError, LookupError, TypeError):
                    following = ""

                entry = {
                    speaker: [parenthetical, scene_no, dia, len(dia),
                              prev, following],
                }
                dialogue_idx += 1
                speakers.append(speaker)
                parenthetical_lis.append(parenthetical)
                dialogues.append(entry)
                scene.append(entry)
            continue

        # Collapse all whitespace (including line breaks) to single spaces.
        text = ' '.join(para.split())
        if text in transitions:
            scene.append({'Transition': text})
            continue
        actionline.append(text)
        scene.append(text)

    scenes.append(scene)
    # The first entry holds whatever preceded the first slugline; drop it.
    scenes = scenes[1:]

    flattened = []
    for parts in scenes:
        flat = []
        for element in parts:
            if isinstance(element, str):
                # Multi-line sluglines become one entry per line.
                flat.extend(element.split("\n"))
            else:
                flat.append(element)
        flattened.append(flat)

    unique_speakers = list(dict.fromkeys(speakers))
    return flattened, actionline, parenthetical_lis, unique_speakers, dialogues