Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/dia_function.py

302 lines
14 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
import re
# import textract
import os
# from MNF.settings import BasePath
# basePath = BasePath()
# Root of the project tree; getRefined() writes its intermediate text files
# under "<basePath>/conversion/subtitling/files/".
basePath = "/home/user/mnf/project/MNF"

# Scene heading: optional scene number, optional dot, optional whitespace,
# then INT or EXT.  (The earlier character-class form `[IE][NX]T` also
# accepted "ENT" and "IXT", so a line such as "ENTER ..." counted as a
# heading.)
slug_pattern = r'[\d]*[.]?[\s]*(?:INT|EXT)'

# Combined heading marker: optional scene number, optional whitespace, then an
# "I" or "E" on each side of a slash (e.g. "I/E", "E/I"), optionally followed
# by a dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Words/phrases that must never be reported as a character name.
reserved_words = [
    'MONTAGE', 'PBS', 'FADE TO BLACK:', 'Beat.', 'VOX POP', 'CUT TO',
    'CUT TO:', 'CUT TO BLACK', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'SHOTS',
    'INTERVAL', 'END CREDITS',
]

# Paragraphs that are recorded as {'Transition': ...} entries in a scene.
transitions = [
    'CUT TO:', 'CUT TO', 'CUT TO BLACK', 'FADE TO BLACK:', 'FADE IN:',
    'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:', 'JUMP TO:', 'CUT BACK TO:',
    'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'END CREDITS', 'INTERVAL',
]
# Prefixes that open a scene heading.  The bare "INT"/"EXT" forms also cover
# the "INT.", "INT ", "INT/EXT" and "EXT/INT" spellings.
_SLUG_PREFIXES = ('INT', 'EXT', 'I/E', 'E/I')
# Lines that start with "INT" but are not scene headings.
_NON_SLUG_PREFIXES = ('INTERCUT', 'INTERVAL', 'INTERMISSION')

# A whole line that is a speaker cue: name words (optionally separated by
# whitespace/hyphens), optional "#<digit>" suffixes, optional "(...)" tail.
# Accepts the same strings as
#   ([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but without a `+` nested inside a `*`, which made the old form backtrack
# exponentially on long all-caps lines that do not match.
_SPEAKER_CUE_RE = re.compile(
    r"(?:[A-Z']+(?:\s+-*|-+))*[A-Z']*(?:#*\s*[1-9])*(?:\(.*\))?")
# Prefix test used by getSpeakers() on upper-case lines.
_SPEAKER_PREFIX_RE = re.compile(
    r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")
# A line that opens with a parenthetical such as "(whispering)".
_PARENTHETICAL_RE = re.compile(r"\(.*\)")


def _is_slugline(text, honor_exclusions=True):
    """Return True when *text* (stripped first) looks like a scene heading.

    With ``honor_exclusions`` set, lines starting with INTERCUT, INTERVAL or
    INTERMISSION are never treated as headings.
    """
    text = text.strip()
    if honor_exclusions and text.startswith(_NON_SLUG_PREFIXES):
        return False
    return bool(text.startswith(_SLUG_PREFIXES)
                or re.match(slug_pattern, text)
                or re.match(pat, text))


def _speakers_by_scene(refined, characters):
    """Group the speaker cues found in *refined* by scene.

    Returns ``(closed, trailing)``: ``closed`` holds one list of speaker names
    for every scene that is followed by another scene heading, ``trailing``
    holds the names collected after the last heading.  Names collected before
    the first heading are discarded.
    """
    known = frozenset(characters)
    closed = []
    current = []
    for para in refined:
        if _is_slugline(para):
            closed.append(current)
            current = []
            continue
        cue = para.split("\n")[0].strip().split('(')[0].strip()
        if cue in known:
            current.append(cue)
    return closed[1:], current


class dialogueBreakdown:
    """Break a screenplay down into scenes, speakers and dialogue records."""

    def getRefined(self, filename1):
        """Extract the text of *filename1* and return its paragraphs.

        The text is extracted with ``textract``, rewritten so that a speaker
        cue, its parentheticals and the first dialogue line are not separated
        by blank lines, and finally split on blank lines.  Two files,
        ``<name>.txt`` and ``<name>1.txt``, are written under
        ``<basePath>/conversion/subtitling/files/``.

        Returns:
            (refined, total_scenes): ``refined`` is the list of non-empty,
            stripped paragraphs starting at the first scene heading.
            ``total_scenes`` is the number of paragraphs that preceded that
            heading.
            NOTE(review): the name suggests a scene count, but that is not
            what is computed -- confirm what callers expect.

        Raises:
            ImportError: if ``textract`` is not installed.
        """
        # Imported here because the module-level import is commented out;
        # this keeps the module importable without the third-party package.
        import textract

        total_scenes = 0
        stem = os.path.splitext(os.path.basename(filename1))[0]
        filename = rf"{basePath}/conversion/subtitling/files/{stem}.txt"
        grouped_path = rf"{basePath}/conversion/subtitling/files/{stem}1.txt"

        text = textract.process(filename1)
        with open(filename, 'wb') as raw_out:
            raw_out.write(text)
        with open(filename, encoding="utf-8") as raw_in:
            doc1 = raw_in.read()

        dialog_coming = False
        seen_slug = False
        with open(grouped_path, 'w') as grouped:
            for line in doc1.split("\n"):
                stripped = line.strip()
                if _is_slugline(stripped):
                    seen_slug = True
                    grouped.write(line)
                    grouped.write('\n')
                    continue
                # NOTE(review): block nesting was reconstructed from a
                # listing without indentation; this assumes everything before
                # the first scene heading is meant to be dropped -- confirm.
                if not seen_slug:
                    continue
                if dialog_coming and stripped == "":
                    # Swallow blank lines between a cue and its dialogue so
                    # they end up in the same paragraph.
                    print("line empty or just have newline", line)
                    continue
                if dialog_coming:
                    print("200 probable dialog or PC", line)
                    grouped.write(line)
                    grouped.write('\n')
                    if _PARENTHETICAL_RE.match(stripped):
                        print(" line 203 matched regular expression\n")
                    else:
                        print(" line 207: else of PCs", line)
                        dialog_coming = False
                        print(" line 208 dialog over")
                    continue
                if line.isupper() and _SPEAKER_CUE_RE.fullmatch(stripped):
                    grouped.write(line)
                    grouped.write('\n')
                    print(" Probable dialogue speaker 211", line)
                    dialog_coming = True
                    continue
                print(
                    "218 probably action or something else so just write it", line)
                grouped.write(line)
                grouped.write('\n')

        # Second pass: run the regrouped text through textract again and
        # overwrite <name>.txt with the result.
        text = textract.process(grouped_path)
        with open(filename, 'wb') as regrouped_out:
            regrouped_out.write(text)
        # The file was just written from textract's output, which the first
        # pass above already reads as UTF-8.
        with open(filename, "r", encoding="utf-8") as regrouped_in:
            paragraphs = regrouped_in.read().split('\n\n')

        refined = [para.strip() for para in paragraphs]
        refined = [para for para in refined if para != ""]
        for index, para in enumerate(refined):
            # This pass does not apply the INTERCUT/INTERVAL/INTERMISSION
            # exclusions.
            if _is_slugline(para, honor_exclusions=False):
                refined = refined[index:]
                break
            total_scenes = total_scenes + 1
        return refined, total_scenes

    def getSlugAndNonSlug(self, refined):
        """Split *refined* into (scene headings, every other paragraph).

        Paragraphs are stripped before classification; order is preserved in
        both returned lists.
        """
        sluglines = []
        without_slug = []
        for para in refined:
            para = para.strip()
            if _is_slugline(para):
                sluglines.append(para)
            else:
                without_slug.append(para)
        return sluglines, without_slug

    def getSpeakers(self, without_slug):
        """Return the distinct character names found in *without_slug*.

        An upper-case line counts as a speaker cue when the probed follow-up
        line is non-blank and the line starts like a name; anything from the
        first "(" onwards is dropped.  Names that are (or contain a word that
        is) a reserved word, and names that are a transition, are removed.
        The order of the returned list is unspecified.
        """
        found = set()
        for para in without_slug:
            lines = para.split('\n')
            last = len(lines) - 1
            for pos, item in enumerate(lines):
                if not item.isupper():
                    continue
                # NOTE(review): the probe is the line two below the cue,
                # clamped to the last line, so a final or solitary upper-case
                # line is probed against itself.  Kept as-is because callers
                # may depend on it -- confirm whether "next line" was meant.
                if lines[min(pos + 2, last)].strip() == "":
                    continue
                if _SPEAKER_PREFIX_RE.match(item):
                    found.add(item.split("(")[0].strip())
        found.discard("")

        reserved = set(reserved_words)
        # Multi-word entries such as 'CUT TO:' can never equal a single word
        # of a name, so the whole name has to be checked as well.
        blocked_names = reserved | set(transitions)
        return [name for name in found
                if name not in blocked_names
                and reserved.isdisjoint(name.split(" "))]

    def getListForPrevAndNextDialogue(self, refined, characters):
        """Collect, scene by scene, the speakers in order of appearance.

        Returns:
            (main_lis, speaker_having_dia): ``main_lis`` has one list of
            speaker names per scene that is followed by another scene
            heading; ``speaker_having_dia`` is the list for the text after
            the last heading (it is not included in ``main_lis``).
        """
        return _speakers_by_scene(refined, characters)

    def getScenes(self, refined, total_scenes, characters):
        """Build the per-scene structure and the per-scene counters.

        Each scene is a list whose first element is the heading paragraph,
        followed by action lines (str), ``{'Transition': text}`` dicts and
        dialogue dicts ``{speaker: [parenthetical, scene_no, dialogue,
        len(dialogue), previous_speaker, next_speaker, cue_line]}``.
        Content before the first scene heading is not returned in ``scenes``.

        Returns:
            A 10-tuple ``(scenes, parenthetical_count_dict,
            predecessor_scene_no_dict, successor_scene_no_dict, actionline,
            parenthetical_lis, speakers, dia_count, dialogues,
            slugline_dic)``.  The ``*_dict`` counters and ``dia_count`` are
            keyed ``'Scene <n>'``; ``slugline_dic`` is keyed by the integer
            scene number.
        """
        scene_keys = ['Scene ' + str(n + 1) for n in range(total_scenes)]
        predecessor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        dia_count = dict.fromkeys(scene_keys, 0)
        successor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        parenthetical_count_dict = dict.fromkeys(scene_keys, 0)

        known_speakers = frozenset(characters)
        # Speaker order per scene, used to look up the previous/next speaker
        # of each dialogue.  Index k holds scene k + 1.
        closed, trailing = _speakers_by_scene(refined, characters)
        cues_by_scene = closed + [trailing]

        scenes = []
        scene = []
        dialogues = []
        speakers = []
        actionline = []
        parenthetical_lis = []
        slugline_dic = {}
        scene_no = 0
        pc = 0          # parentheticals seen in the current scene
        cue_idx = 0     # speaker paragraphs seen in the current scene

        for line in refined:
            if _is_slugline(line):
                if scene_no > 0:
                    # Store the count under the scene that just ended.
                    parenthetical_count_dict['Scene ' + str(scene_no)] = pc
                pc = 0
                scenes.append(scene)
                scene_no += 1
                cue_idx = 0
                # The links written here belong to the scene *after* the one
                # that is starting now.
                predecessor_scene_no_dict['Scene ' +
                                          str(scene_no + 1)] = scene_no
                successor_scene_no_dict['Scene ' + str(scene_no + 1)] = min(
                    scene_no + 2, total_scenes)
                successor_scene_no_dict['Scene ' + str(1)] = 2
                scene = [line]
                slugline_dic[scene_no] = line.split(
                    "\n")[0].strip('0123456789.- ')
                continue

            lis = [part.strip() for part in line.split("\n")]
            print(" \n Line 363 probable dialogue list", lis)
            word = lis[0]
            speaker = word.split('(')[0].strip()

            if speaker not in known_speakers:
                # Action line or transition: collapse all whitespace runs.
                flat = ' '.join(line.split())
                if flat in transitions:
                    scene.append({'Transition': flat})
                    continue
                actionline.append(flat)
                scene.append(flat)
                continue

            print("Speaker 378", speaker)
            extendedSpeaker = word.strip()
            position = cue_idx
            cue_idx += 1

            parenthetical = 'NONE'
            if len(lis) > 1 and _PARENTHETICAL_RE.match(lis[1]):
                pc = pc + 1
                parenthetical = lis[1].replace("\n", "")
                dia = ' '.join(lis[2:]).replace("\n", "")
            else:
                # NOTE(review): lines are joined without a separator here but
                # with a space in the branch above -- confirm which is meant.
                dia = ''.join(lis[1:]).replace("\n", "")
                dia = dia.replace("\"", '')
            print(" length dia\n", len(dia))
            if len(dia) == 0 and parenthetical == "NONE":
                # A bare cue with nothing to say produces no record.
                continue
            print(" len dia != and Parenthetical == NONE: 384 ")

            scene_cues = cues_by_scene[scene_no - 1] if scene_no >= 1 else []
            prev_speaker = ""
            next_speaker = ""
            if 0 < position <= len(scene_cues):
                prev_speaker = scene_cues[position - 1]
            if position + 1 < len(scene_cues):
                next_speaker = scene_cues[position + 1]

            mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                               prev_speaker, next_speaker, extendedSpeaker]}
            print("mydic 398", speaker, mydic[speaker])
            # .get(): total_scenes may be smaller than the number of scene
            # headings actually present in `refined`.
            count_key = 'Scene ' + str(scene_no)
            dia_count[count_key] = dia_count.get(count_key, 0) + 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)

        scenes.append(scene)
        parenthetical_count_dict['Scene ' + str(scene_no)] = pc
        speakers = list(set(speakers))
        # The first entry holds whatever preceded the first scene heading.
        scenes = scenes[1:]
        print("Scenes:", scenes)
        return scenes, parenthetical_count_dict, predecessor_scene_no_dict, successor_scene_no_dict, actionline, parenthetical_lis, speakers, dia_count, dialogues, slugline_dic