302 lines
14 KiB
Python
302 lines
14 KiB
Python
|
import logging
import os
import re

# import textract
|
|||
|
# from MNF.settings import BasePath
|
|||
|
# basePath = BasePath()
|
|||
|
|
|||
|
logger = logging.getLogger(__name__)

# Root of the project checkout; getRefined() writes its working files below it.
basePath = "/home/user/mnf/project/MNF"

# Scene heading, optionally preceded by a scene number: "INT", "12. EXT", ...
# (The previous character-class form `[IE][NX]T` also accepted "ENT"/"IXT",
# so a line such as "ENTER ..." was taken for a scene heading.)
slug_pattern = r'[\d]*[.]?[\s]*(?:INT|EXT)'

# Combined interior/exterior heading, optionally numbered: "I/E", "4 E/I."
pat = r'[\d]*[\s]*(?:I/E|E/I)[.]?'

# Upper-case words/phrases that must never be reported as character names.
reserved_words = ['MONTAGE', 'PBS', 'FADE TO BLACK:', 'Beat.', 'VOX POP', 'CUT TO', 'CUT TO:',
                  'CUT TO BLACK', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'SHOTS', 'INTERVAL', 'END CREDITS']

# Paragraphs that are recorded as {'Transition': ...} entries by getScenes().
transitions = ['CUT TO:', 'CUT TO', 'CUT TO BLACK', 'FADE TO BLACK:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:',
               'JUMP CUT TO:', 'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'END CREDITS', 'INTERVAL']


class dialogueBreakdown:
    """Break a screenplay into sluglines, speakers, dialogues and scenes.

    The methods are meant to be chained: getRefined() produces the list of
    paragraphs, getSlugAndNonSlug() / getSpeakers() derive the character
    names, and getScenes() builds the per-scene structures.
    """

    # Literal heading prefixes; kept next to the regexes so detection still
    # works if the module-level patterns are retuned.
    _SLUG_PREFIXES = ('INT.', 'INT ', 'EXT.', 'EXT ', 'INT/EXT', 'EXT/INT', 'I/E', 'E/I')

    # Words that start with "INT" but are not scene headings.
    _NON_SLUG_PREFIXES = ('INTERCUT', 'INTERMISSION', 'INTERVAL')

    # A whole line that is a speaker cue: NAME, NAME-NAME, NAME #2, NAME (V.O.)
    # Accepts the same strings as the former
    #   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
    # but every letter run is now followed by a mandatory separator (or is the
    # last run), which removes the (a+)* ambiguity that made fullmatch()
    # backtrack exponentially on long upper-case action lines.
    _SPEAKER_CUE = re.compile(
        r"(?:[A-Z'’]+(?:\s+-*|-+))*[A-Z'’]*(?:#*\s*[1-9])*(?:\(.*\))?")

    # A line that starts with a parenthetical, e.g. "(whispering)".
    _PARENTHETICAL = re.compile(r"\(.*\)")

    @staticmethod
    def _isSlugline(text):
        """Return True when *text* (a line or a paragraph) starts a scene."""
        text = text.strip()
        if text.startswith(dialogueBreakdown._NON_SLUG_PREFIXES):
            return False
        return (text.startswith(dialogueBreakdown._SLUG_PREFIXES)
                or re.match(slug_pattern, text) is not None
                or re.match(pat, text) is not None)

    @staticmethod
    def _sceneLines(doc):
        """Return the lines of *doc* from the first slugline onwards.

        Blank lines between a speaker cue and the first line of its dialogue
        are dropped (parentheticals in between are kept), so that a cue and
        its dialogue end up in the same blank-line separated paragraph.
        Everything before the first slugline is discarded.
        """
        kept = []
        in_scenes = False
        awaiting_dialogue = False
        for line in doc.split("\n"):
            if dialogueBreakdown._isSlugline(line):
                in_scenes = True
                kept.append(line)
                continue
            if not in_scenes:
                continue

            stripped = line.strip()
            if awaiting_dialogue:
                if stripped == "":
                    continue
                kept.append(line)
                # A parenthetical keeps us waiting for the actual dialogue.
                if not dialogueBreakdown._PARENTHETICAL.match(stripped):
                    awaiting_dialogue = False
                continue

            if line.isupper() and dialogueBreakdown._SPEAKER_CUE.fullmatch(stripped):
                logger.debug("probable dialogue speaker: %s", line)
                awaiting_dialogue = True
            kept.append(line)
        return kept

    def getRefined(self, filename1):
        """Extract the screenplay text and split it into paragraphs.

        The text of *filename1* is extracted with textract, trimmed to start
        at the first slugline, and written to two working files under
        ``{basePath}/conversion/subtitling/files/`` (``<name>.txt`` and
        ``<name>1.txt``).

        Returns:
            (refined, total_scenes): the non-empty, stripped paragraphs
            starting at the first scene heading, and the number of
            paragraphs that are scene headings.
        """
        # The module-level import is commented out, so without this the calls
        # below raise NameError. Imported here to keep module import cheap.
        import textract

        stem = os.path.splitext(os.path.basename(filename1))[0]
        raw_path = rf"{basePath}/conversion/subtitling/files/{stem}.txt"
        joined_path = rf"{basePath}/conversion/subtitling/files/{stem}1.txt"

        with open(raw_path, 'wb') as handle:
            handle.write(textract.process(filename1))
        with open(raw_path, encoding="utf-8") as handle:
            doc1 = handle.read()

        # Explicit UTF-8: names may contain the typographic apostrophe, which
        # a locale-dependent default encoding may not be able to write.
        with open(joined_path, 'w', encoding="utf-8") as handle:
            handle.writelines(
                line + '\n' for line in dialogueBreakdown._sceneLines(doc1))

        # Second extraction pass over the cleaned file; its output replaces
        # the raw text file.
        with open(raw_path, 'wb') as handle:
            handle.write(textract.process(joined_path))
        with open(raw_path, encoding="utf-8") as handle:
            paragraphs = handle.read().split('\n\n')

        refined = [para.strip() for para in paragraphs]
        refined = [para for para in refined if para != ""]

        # Drop anything that still precedes the first scene heading.
        for index, para in enumerate(refined):
            if dialogueBreakdown._isSlugline(para):
                refined = refined[index:]
                break

        total_scenes = sum(
            1 for para in refined if dialogueBreakdown._isSlugline(para))
        return refined, total_scenes

    def getSlugAndNonSlug(self, refined):
        """Split paragraphs into scene headings and everything else.

        Returns:
            (sluglines, without_slug): two lists of stripped paragraphs, each
            in the order in which they appear in *refined*.
        """
        sluglines = []
        without_slug = []
        for para in refined:
            para = para.strip()
            if dialogueBreakdown._isSlugline(para):
                sluglines.append(para)
            else:
                without_slug.append(para)
        return sluglines, without_slug

    def getSpeakers(self, without_slug):
        """Collect the character names found in non-slugline paragraphs.

        A line counts as a speaker cue when it is upper-case, starts with a
        letter or apostrophe, and the probed follow-up line is not blank.
        Anything from the first "(" on is cut off the name. Names that are a
        reserved phrase or transition, or that contain a reserved word, are
        dropped.

        Returns:
            The distinct names in first-seen order.
        """
        cue_start = re.compile(
            r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
        candidates = []
        for para in without_slug:
            lis = para.split('\n')
            last = len(lis) - 1
            for position, item in enumerate(lis):
                if not item.isupper():
                    continue
                # NOTE(review): this reproduces the original running-index
                # arithmetic, which probes the line two positions ahead,
                # clamped to the last line (so a final line probes itself).
                # It looks like an off-by-one of "next line"; kept as is
                # because callers may rely on one-line cues being accepted.
                probe = lis[min(position + 2, last)]
                if probe.strip() == "":
                    continue
                if cue_start.match(item):
                    candidates.append(item.split("(")[0].strip())

        reserved = set(reserved_words)
        # Whole-phrase check: multi-word entries such as "CUT TO:" can never
        # be caught by the per-token test below.
        blocked_phrases = reserved.union(transitions)
        return [name for name in dict.fromkeys(candidates)
                if name
                and name not in blocked_phrases
                and reserved.isdisjoint(name.split(" "))]

    def getListForPrevAndNextDialogue(self, refined, characters):
        """List, per scene, the speakers in the order in which they talk.

        A paragraph contributes a speaker when its first line (up to the
        first "(") is one of *characters*.

        Returns:
            (main_lis, speaker_having_dia): *main_lis* holds one list per
            scene for every scene except the last one; the speakers of the
            last scene (or of the whole text when it has no slugline) are
            returned separately as *speaker_having_dia*.
        """
        known = frozenset(characters)
        main_lis = []
        speaker_having_dia = []
        for line in refined:
            if dialogueBreakdown._isSlugline(line):
                main_lis.append(speaker_having_dia)
                speaker_having_dia = []
                continue
            speaker = line.split("\n")[0].strip().split('(')[0].strip()
            if speaker in known:
                speaker_having_dia.append(speaker)
        # The first entry collected whatever preceded the first slugline.
        return main_lis[1:], speaker_having_dia

    def getScenes(self, refined, total_scenes, characters):
        """Build the scene structures for the paragraphs in *refined*.

        Args:
            refined: paragraphs as produced by getRefined().
            total_scenes: number of scenes; used to pre-fill the per-scene
                dictionaries and to cap successor scene numbers.
            characters: known character names, see getSpeakers().

        Returns:
            A 10-tuple ``(scenes, parenthetical_count_dict,
            predecessor_scene_no_dict, successor_scene_no_dict, actionline,
            parenthetical_lis, speakers, dia_count, dialogues,
            slugline_dic)``.

            Each scene is a list starting with its slugline paragraph,
            followed by action strings, ``{'Transition': text}`` dicts and
            dialogue dicts of the form ``{speaker: [parenthetical, scene_no,
            dialogue, len(dialogue), previous_speaker, next_speaker,
            full_cue_line]}``. The per-scene dictionaries are keyed
            ``'Scene <n>'``; *slugline_dic* is keyed by scene number.
        """
        known = frozenset(characters)

        # Speaker order per scene, used for the previous/next speaker fields.
        # (The original read an undefined `main_lis` inside a bare `except`,
        # so both fields were always empty.)
        earlier, final = dialogueBreakdown.getListForPrevAndNextDialogue(
            self, refined, characters)
        scene_speakers = earlier + [final]

        scene_keys = ['Scene ' + str(n + 1) for n in range(total_scenes)]
        predecessor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        successor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        parenthetical_count_dict = dict.fromkeys(scene_keys, 0)
        dia_count = dict.fromkeys(scene_keys, 0)

        scenes = []
        scene = []
        dialogues = []
        speakers = []
        actionline = []
        parenthetical_lis = []
        slugline_dic = {}
        scene_no = 0   # number of the scene currently being filled (1-based)
        pc = 0         # parentheticals seen in the current scene
        cue_pos = 0    # index of the next speaker paragraph within the scene

        for line in refined:
            if dialogueBreakdown._isSlugline(line):
                if scene_no > 0:
                    # Close the scene that just ended under its own number.
                    parenthetical_count_dict['Scene ' + str(scene_no)] = pc
                # NOTE(review): reset assumed unconditional; the original
                # indentation of this statement is not recoverable.
                pc = 0
                scenes.append(scene)
                scene = [line]
                cue_pos = 0
                scene_no += 1

                # Neighbour numbers are recorded for the scene *after* the
                # one that starts here, as in the original.
                # NOTE(review): this also creates an entry for 'Scene N+1'
                # after the last scene - confirm whether consumers need it.
                upcoming = 'Scene ' + str(scene_no + 1)
                predecessor_scene_no_dict[upcoming] = scene_no
                successor_scene_no_dict[upcoming] = min(scene_no + 2, total_scenes)
                successor_scene_no_dict['Scene ' + str(1)] = 2

                slugline_dic[scene_no] = line.split("\n")[0].strip('0123456789.- ')
                continue

            lis = [part.strip() for part in line.split("\n")]
            word = lis[0]
            speaker = word.split('(')[0].strip()

            if speaker not in known:
                # Action or transition: collapse all whitespace runs.
                flat = ' '.join(line.split())
                if flat in transitions:
                    scene.append({'Transition': flat})
                else:
                    actionline.append(flat)
                    scene.append(flat)
                continue

            position = cue_pos
            cue_pos += 1

            parenthetical = 'NONE'
            if len(lis) > 1 and dialogueBreakdown._PARENTHETICAL.match(lis[1]):
                pc += 1
                parenthetical = lis[1]
                dia = ' '.join(lis[2:])
            else:
                # Joined with a space (was ''), otherwise the last and first
                # words of consecutive lines run together.
                dia = ' '.join(lis[1:]).replace("\"", '')
            logger.debug("speaker %s, dialogue length %d", speaker, len(dia))

            # NOTE(review): everything below is assumed to belong to this
            # guard; a cue without dialogue or parenthetical is not recorded.
            if len(dia) == 0 and parenthetical == 'NONE':
                continue

            if 1 <= scene_no <= len(scene_speakers):
                cast = scene_speakers[scene_no - 1]
            else:
                cast = []
            prev_speaker = cast[position - 1] if 1 <= position <= len(cast) else ""
            next_speaker = cast[position + 1] if position + 1 < len(cast) else ""

            mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                               prev_speaker, next_speaker, word.strip()]}

            # .get(): total_scenes may be smaller than the real scene count.
            count_key = 'Scene ' + str(scene_no)
            dia_count[count_key] = dia_count.get(count_key, 0) + 1

            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)

        scenes.append(scene)
        parenthetical_count_dict['Scene ' + str(scene_no)] = pc
        speakers = list(dict.fromkeys(speakers))
        # The first entry holds whatever preceded the first slugline.
        scenes = scenes[1:]
        logger.debug("Scenes: %s", scenes)

        return scenes, parenthetical_count_dict, predecessor_scene_no_dict, successor_scene_no_dict, actionline, parenthetical_lis, speakers, dia_count, dialogues, slugline_dic
|