Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/dia_function.py

302 lines
14 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# import textract
import os
# from MNF.settings import BasePath
# basePath = BasePath()
# Root directory of the MNF project; intermediate text files are written
# beneath "<basePath>/conversion/subtitling/files/".
basePath = "/home/user/mnf/project/MNF"

# Loose scene-heading matcher: optional scene number, optional dot, optional
# whitespace, then one of I/E, one of N/X, and T (so INT, EXT, but also ENT/IXT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'

# Scene-heading matcher for the "I/E" and "E/I" abbreviations, optionally
# preceded by a scene number and followed by a dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Upper-case cues that must not be reported as character names.
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE TO BLACK:',
    'Beat.',
    'VOX POP',
    'CUT TO',
    'CUT TO:',
    'CUT TO BLACK',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'SHOTS',
    'INTERVAL',
    'END CREDITS',
    'INTERVAL',
]

# Paragraphs that are recorded as {'Transition': ...} entries inside a scene.
transitions = [
    'CUT TO:',
    'CUT TO',
    'CUT TO BLACK',
    'FADE TO BLACK:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'END CREDITS',
    'INTERVAL',
]
class dialogueBreakdown:
    """Break an extracted screenplay into sluglines, speakers, dialogues and scenes.

    The methods keep no instance state.  They work on "paragraphs": the
    blank-line separated chunks of text returned by ``getRefined``.  Slugline
    detection relies on the module-level ``slug_pattern`` and ``pat`` regexes,
    speaker filtering on ``reserved_words`` and ``transitions``.
    """

    # Paragraph prefixes treated as scene headings.
    # NOTE(review): together with ``slug_pattern`` this is deliberately loose
    # and also accepts lines such as "INTO ..." or "ENTER ..."; confirm whether
    # a stricter heading test is wanted before tightening it.
    _SLUG_PREFIXES = ('INT', 'EXT', 'I/E', 'E/I')
    # Prefixes the loose heading test would accept but that are not headings.
    _NOT_SLUG_PREFIXES = ('INTERCUT', 'INTERVAL', 'INTERMISSION')

    _PARENTHETICAL_RE = re.compile(r"\(.*\)")
    # Whole-line speaker cue: NAME words separated by whitespace/hyphens, an
    # optional "#1"-style number and an optional "(...)" extension.  This
    # accepts the same strings as "([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*"
    # but every name run must be followed by a non-empty separator, which
    # removes the exponential backtracking the nested quantifier caused on
    # long all-caps lines that end in punctuation.
    _CUE_LINE_RE = re.compile(
        r"(?:[A-Z']+(?:\s+-*|-+))*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")
    # Prefix test used when collecting speaker names from paragraphs.
    _CUE_PREFIX_RE = re.compile(
        r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")

    @staticmethod
    def _is_slugline(text):
        """Return True when *text* (a line or a paragraph) starts like a scene heading."""
        candidate = text.strip()
        if candidate.startswith(dialogueBreakdown._NOT_SLUG_PREFIXES):
            return False
        return bool(
            candidate.startswith(dialogueBreakdown._SLUG_PREFIXES)
            or re.match(slug_pattern, candidate)
            or re.match(pat, candidate)
        )

    @staticmethod
    def _filter_script_lines(lines):
        """Yield the lines of the raw script text that belong in the cleaned file.

        Everything before the first slugline is dropped.  After a speaker cue,
        blank lines are skipped until the dialogue arrives, so the cue, its
        parentheticals and the first dialogue line end up in one paragraph.
        """
        in_script = False
        dialog_coming = False
        for line in lines:
            if dialogueBreakdown._is_slugline(line):
                in_script = True
                yield line
                continue
            if not in_script:
                continue
            if dialog_coming:
                if line.strip() == "":
                    continue
                yield line
                # A parenthetical keeps us waiting for the dialogue itself;
                # any other line is taken as the dialogue and ends the wait.
                if not dialogueBreakdown._PARENTHETICAL_RE.match(line.strip()):
                    dialog_coming = False
                continue
            if line.isupper() and dialogueBreakdown._CUE_LINE_RE.fullmatch(line.strip()):
                dialog_coming = True
            yield line

    @staticmethod
    def _trim_and_count(paragraphs):
        """Strip paragraphs, drop empty ones and everything before the first slugline.

        Returns ``(paragraphs, total_scenes)`` where ``total_scenes`` is the
        number of slugline paragraphs kept.  When no slugline exists at all,
        the paragraphs are returned untrimmed with a count of 0.
        """
        cleaned = [chunk.strip() for chunk in paragraphs]
        cleaned = [chunk for chunk in cleaned if chunk != ""]
        for index, chunk in enumerate(cleaned):
            if dialogueBreakdown._is_slugline(chunk):
                cleaned = cleaned[index:]
                break
        scene_total = sum(
            1 for chunk in cleaned if dialogueBreakdown._is_slugline(chunk))
        return cleaned, scene_total

    @staticmethod
    def _link_neighbours(cues):
        """Fill the prev/next speaker slots of one scene's dialogue records in place.

        *cues* is a list of ``(speaker, record)`` pairs in script order; slots
        4 and 5 of each record receive the neighbouring speaker or "".
        """
        names = [name for name, _ in cues]
        for index, (_, record) in enumerate(cues):
            record[4] = names[index - 1] if index > 0 else ""
            record[5] = names[index + 1] if index + 1 < len(names) else ""

    def getRefined(self, filename1):
        """Extract a script file to text and return its paragraphs from the first slugline on.

        Side effects: writes ``<name>.txt`` (extracted text) and ``<name>1.txt``
        (cleaned text) under ``<basePath>/conversion/subtitling/files/``.

        Args:
            filename1: path of the script document handed to ``textract.process``.

        Returns:
            ``(refined, total_scenes)``: the non-empty, stripped paragraphs
            starting at the first slugline, and the number of slugline
            paragraphs among them.
        """
        # The module-level import is commented out, so bring the name into
        # scope here; the dependency is only needed when this method runs.
        import textract

        stem = os.path.splitext(os.path.basename(filename1))[0]
        raw_path = rf"{basePath}/conversion/subtitling/files/{stem}.txt"
        cleaned_path = rf"{basePath}/conversion/subtitling/files/{stem}1.txt"

        # textract.process output is written in binary mode and read back as UTF-8.
        with open(raw_path, 'wb') as raw_file:
            raw_file.write(textract.process(filename1))
        with open(raw_path, encoding="utf-8") as raw_file:
            script_text = raw_file.read()

        with open(cleaned_path, 'w') as cleaned_file:
            cleaned_file.writelines(
                kept + '\n'
                for kept in self._filter_script_lines(script_text.split("\n")))

        # Round-trip the cleaned file through textract again and overwrite the
        # first text file with the result, as the pipeline has always done.
        with open(raw_path, 'wb') as raw_file:
            raw_file.write(textract.process(cleaned_path))
        with open(raw_path, "r", encoding="utf-8") as raw_file:
            paragraphs = raw_file.read().split('\n\n')

        return self._trim_and_count(paragraphs)

    def getSlugAndNonSlug(self, refined):
        """Partition paragraphs into sluglines and everything else.

        Returns:
            ``(sluglines, without_slug)``, both lists of stripped paragraphs
            in their original order.
        """
        sluglines = []
        without_slug = []
        for para in refined:
            para = para.strip()
            if dialogueBreakdown._is_slugline(para):
                sluglines.append(para)
            else:
                without_slug.append(para)
        return sluglines, without_slug

    def getSpeakers(self, without_slug):
        """Collect the distinct speaker names found in non-slugline paragraphs.

        A line counts as a speaker cue when it is upper-case, starts like a
        name, and the probed follow-up line is not blank.  The name is the
        text before any "(".  Names that are, or contain, a reserved word and
        names that equal a transition are dropped.

        Returns:
            Unique names in order of first appearance.
        """
        found = []
        for para in without_slug:
            rows = para.split('\n')
            last = len(rows) - 1
            for index, row in enumerate(rows):
                if not row.isupper():
                    continue
                # NOTE(review): the probe looks two lines ahead (clamped to the
                # last line), exactly as the original index arithmetic did;
                # confirm whether "the next line" was intended.
                if rows[min(index + 2, last)].strip() == "":
                    continue
                if dialogueBreakdown._CUE_PREFIX_RE.match(row):
                    found.append(row.split("(")[0].strip())

        reserved = set(reserved_words)
        not_names = reserved.union(transitions)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result no longer depends on set iteration order.
        return [
            name for name in dict.fromkeys(found)
            if len(name) > 0
            and name not in not_names
            and reserved.isdisjoint(name.split(" "))
        ]

    def getListForPrevAndNextDialogue(self, refined, characters):
        """List, per scene, the speakers of its dialogue paragraphs in order.

        Returns:
            ``(main_lis, speaker_having_dia)``: one speaker list for every
            scene except the last, and the speaker list of the last scene.
        """
        per_scene = []
        current = []
        for para in refined:
            if dialogueBreakdown._is_slugline(para):
                per_scene.append(current)
                current = []
                continue
            cue = para.split("\n")[0].strip().split('(')[0].strip()
            if cue in characters:
                current.append(cue)
        # The first entry holds whatever preceded the first slugline.
        return per_scene[1:], current

    def getScenes(self, refined, total_scenes, characters):
        """Build the per-scene structure and the counters derived from it.

        Each scene is a list starting with its slugline paragraph, followed by
        action lines (str), ``{'Transition': text}`` dicts and dialogue dicts
        ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue),
        prev_speaker, next_speaker, cue_line]}``.  ``prev_speaker`` and
        ``next_speaker`` are the neighbouring dialogue speakers within the
        same scene, or "" at the scene's edges.

        Args:
            refined: paragraphs, normally starting with a slugline.
            total_scenes: number of scenes, used to pre-seed the 'Scene N'
                dictionaries and to cap successor scene numbers.
            characters: known speaker names.

        Returns:
            ``(scenes, parenthetical_count_dict, predecessor_scene_no_dict,
            successor_scene_no_dict, actionline, parenthetical_lis, speakers,
            dia_count, dialogues, slugline_dic)``.
        """
        scene_keys = ['Scene ' + str(number + 1) for number in range(total_scenes)]
        predecessor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        successor_scene_no_dict = dict.fromkeys(scene_keys, 0)
        parenthetical_count_dict = dict.fromkeys(scene_keys, 0)
        dia_count = dict.fromkeys(scene_keys, 0)

        scenes = []
        scene = []
        scene_cues = []  # (speaker, record) pairs of the scene being built
        dialogues = []
        speakers = []
        actionline = []
        parenthetical_lis = []
        slugline_dic = {}
        scene_no = 0
        pc = 0  # parentheticals seen in the scene being built

        for para in refined:
            if dialogueBreakdown._is_slugline(para):
                # Close the scene that just ended, under its own number.
                if scene_no > 0:
                    parenthetical_count_dict['Scene ' + str(scene_no)] = pc
                pc = 0
                dialogueBreakdown._link_neighbours(scene_cues)
                scenes.append(scene)
                scene = []
                scene_cues = []

                # Neighbour numbers are recorded one scene ahead: entering
                # scene k stores the predecessor/successor of scene k + 1.
                predecessor_scene_no = scene_no + 1
                successor_scene_no = min(scene_no + 3, total_scenes)
                scene_no += 1
                next_key = 'Scene ' + str(scene_no + 1)
                predecessor_scene_no_dict[next_key] = predecessor_scene_no
                successor_scene_no_dict[next_key] = successor_scene_no
                successor_scene_no_dict['Scene ' + str(1)] = 2

                scene.append(para)
                slugline_dic[scene_no] = para.split(
                    "\n")[0].strip('0123456789.- ')
                continue

            rows = [row.strip() for row in para.split("\n")]
            cue_line = rows[0]
            speaker = cue_line.split('(')[0].strip()

            if speaker not in characters:
                flat = ' '.join(para.replace("\n", " ").split())
                if flat.strip() in transitions:
                    scene.append({'Transition': flat.strip()})
                    continue
                actionline.append(flat)
                scene.append(flat.strip())
                continue

            parenthetical = 'NONE'
            if len(rows) > 1 and dialogueBreakdown._PARENTHETICAL_RE.match(rows[1]):
                pc += 1
                parenthetical = rows[1]
                dia = ' '.join(rows[2:])
            else:
                # Lines are stripped, so they must be joined with a space to
                # keep words on adjacent lines apart.
                dia = ' '.join(rows[1:]).replace("\"", '')

            if len(dia) == 0 and parenthetical == "NONE":
                # A bare cue with nothing attached produces no entry.
                continue

            record = [parenthetical, scene_no, dia, len(dia), "", "", cue_line]
            mydic = {speaker: record}
            scene_cues.append((speaker, record))

            count_key = 'Scene ' + str(scene_no)
            # .get keeps this working when total_scenes under-counts the scenes.
            dia_count[count_key] = dia_count.get(count_key, 0) + 1

            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)

        dialogueBreakdown._link_neighbours(scene_cues)
        scenes.append(scene)
        parenthetical_count_dict['Scene ' + str(scene_no)] = pc

        # Unique speakers in order of first appearance (deterministic).
        speakers = list(dict.fromkeys(speakers))
        # The first collected "scene" is whatever preceded the first slugline.
        scenes = scenes[1:]

        return (scenes, parenthetical_count_dict, predecessor_scene_no_dict,
                successor_scene_no_dict, actionline, parenthetical_lis,
                speakers, dia_count, dialogues, slugline_dic)