# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/script_reading.py
#
# File-viewer header (kept for reference): 319 lines, 14 KiB, Python.
# Last change recorded by the viewer: 2024-04-27 09:33:09 +00:00
# import textract
import re
import docx
import os
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
basePath = BasePath()
# -> Patterns and keyword tables for detecting slug lines, transitions,
#    action lines and dialogues.

# Optional scene number, optional dot and whitespace, then a three-letter
# marker such as INT or EXT (the character classes also admit ENT and IXT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'

# Optional scene number and whitespace, then a combined marker such as I/E or
# E/I, with an optional trailing dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Phrases that mark a paragraph as a transition.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]

# Upper-case words that must not be treated as character names.
# 'FADE' is listed twice, as before, so the list contents stay unchanged.
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]
# -> Random function - no use
def breaksen(s):
    """Break a long sentence into chunks of at most 32 words.

    Args:
        s: The sentence to split.

    Returns:
        ``[s]`` unchanged when ``s`` has 256 words or fewer. Otherwise a list
        of strings, each holding up to 32 consecutive words of ``s`` joined by
        single spaces. No empty chunk is produced, including when the word
        count is an exact multiple of 32.
    """
    # Split once; the chunks below are slices of this list.
    words = s.split()
    if len(words) <= 256:
        return [s]
    # Stepping by the chunk size yields exactly ceil(len(words) / 32) chunks,
    # so there is never a trailing empty string.
    return [" ".join(words[start:start + 32])
            for start in range(0, len(words), 32)]
# -> Function for Getting Languages and Scripts related to a Script(Docx)
def getRefined(filename1):
    """Extract a script's text and split it into cleaned paragraphs.

    The text of ``filename1`` is extracted with ``textract`` and scanned line
    by line. Lines before the first slug line are dropped. From the first slug
    line on, lines are copied to ``file1.txt``; blank lines that follow a
    speaker cue are skipped so that the cue, any parenthetical lines and the
    first dialogue line stay together in one paragraph. ``file1.txt`` is then
    read back and split on blank lines.

    Two scratch files under ``{basePath}/conversion/translation/`` are
    overwritten on every call: ``file.txt`` and ``file1.txt``.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        A tuple ``(refined, total_scenes)``. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first paragraph that
        begins like a slug line (the full list if none does).
        ``total_scenes`` is the number of paragraphs that came before that
        first slug-line paragraph.
    """
    # NOTE(review): `textract` is used below, but its module-level import is
    # commented out in the visible part of this file; confirm it is in scope.
    slug_prefixes = ('INT.', 'INT ', 'I/E', 'E/I',
                     'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT')
    # Upper-case words that start with INT but are not slug lines.
    non_slug_prefixes = ('INTERCUT', 'INTERMISSION', 'INTERVAL')

    # Speaker cue: upper-case words (optionally separated by whitespace and/or
    # hyphens), optional "#<digit>" suffixes, optional parenthetical.
    # Accepts the same lines as the earlier
    #   ([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
    # form, but without a quantified group whose body can match letters only;
    # that form could backtrack exponentially on a long upper-case line that
    # is not a speaker cue.
    speaker_cue = re.compile(
        r"(?:[A-Z']+(?:\s+-*|-+))*[A-Z']*(?:#*\s*[1-9])*(?:\(.*\))?")
    # Speaker cue introduced by a title, e.g. "DR. SMITH".
    titled_cue = re.compile(
        r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT\. HON|SR|ST)\. [A-Z]+")

    def starts_like_slug(text):
        """Return True when ``text`` begins like a slug line."""
        return bool(text.startswith(slug_prefixes)
                    or re.match(slug_pattern, text)
                    or re.match(pat, text))

    raw_path = rf"{basePath}/conversion/translation/file.txt"
    marked_path = rf"{basePath}/conversion/translation/file1.txt"

    # Extract the script text and round-trip it through file.txt so it is
    # decoded as UTF-8 with undecodable bytes ignored.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        doc11 = raw_file.read()

    with open(marked_path, 'w', encoding="utf8", errors='ignore') as marked_file:
        seen_slug = False       # nothing is copied before the first slug line
        dialog_coming = False   # True right after a speaker cue
        for line in doc11.split("\n"):
            line = line.strip()

            # -> Slug lines
            if starts_like_slug(line) and not line.startswith(non_slug_prefixes):
                seen_slug = True
                marked_file.write(line + '\n')
                continue
            if not seen_slug:
                continue

            # -> Parenthetical or dialogue following a speaker cue
            if dialog_coming:
                if line == "":
                    # Skip blank lines so the dialogue stays attached to its cue.
                    continue
                marked_file.write(line + '\n')
                if not re.match(r"\(.*\)", line):
                    # A non-parenthetical line is the dialogue; the cue is done.
                    dialog_coming = False
                continue

            # -> Speaker cue: the next non-blank line is expected to be dialogue
            if line.isupper() and (speaker_cue.fullmatch(line)
                                   or titled_cue.fullmatch(line)):
                marked_file.write(line + '\n')
                dialog_coming = True
                continue

            # -> Action line or anything else; blank lines are written too and
            #    become the paragraph separators used below.
            marked_file.write(line + '\n')

    # -> Copy file1.txt back into file.txt (through textract, as bytes).
    text = textract.process(marked_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    # Read with the same explicit encoding as above rather than the platform
    # default.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    # -> Refined list of paragraphs (scene headings and their data)
    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]

    # Paragraphs ahead of the first slug line are counted and discarded.
    total_scenes = 0
    for index, paragraph in enumerate(refined):
        if starts_like_slug(paragraph):
            refined = refined[index:]
            break
        total_scenes += 1
    return refined, total_scenes
# -> For Getting Slug lines and Non-Slug lines from Refined Data
def getSlugAndNonSlug(refined):
    """Partition refined paragraphs into slug lines and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A tuple ``(sluglines, without_slug)`` of two lists. Each paragraph is
        stripped and placed in ``sluglines`` when it starts with one of the
        INT/EXT style prefixes or matches ``slug_pattern`` or ``pat`` at its
        start; otherwise it goes to ``without_slug``. Input order is kept
        within each list.
    """
    slug_starts = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT',
                   'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for raw_para in refined:
        para = raw_para.strip()
        looks_like_slug = (
            para.startswith(slug_starts)
            or re.match(slug_pattern, para)
            or re.match(pat, para)
        )
        target = sluglines if looks_like_slug else without_slug
        target.append(para)
    return sluglines, without_slug
# -> For Extracting the Speakers from Refined Data
def getSpeakers(without_slug):
    """Collect the speaker (character) names found in non-slug paragraphs.

    A line is taken as a speaker cue when it is upper-case, the look-ahead
    line described below is not blank, and it starts like a character name.
    The name is the text before the first ``(``, stripped.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by
            ``\\n``).

    Returns:
        A list of unique, non-empty names in order of first appearance.
        Names containing any word from ``reserved_words`` are excluded.
    """
    cue_start = re.compile(r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")
    reserved = set(reserved_words)

    # A dict keeps insertion order, so the result is reproducible from run to
    # run; a set of strings can iterate in a different order on each run.
    found = {}
    for para in without_slug:
        lis = para.split('\n')
        last_index = len(lis) - 1
        for position, item in enumerate(lis):
            if not item.isupper():
                continue
            # NOTE(review): the look-ahead is the line two below the candidate
            # (clamped to the last line), not the line directly below it. This
            # is kept as it was; confirm whether "next line" was intended.
            lookahead = lis[min(position + 2, last_index)]
            if lookahead.strip() == "":
                continue
            if cue_start.match(item):
                found.setdefault(item.split("(")[0].strip(), None)

    return [name for name in found
            if name and reserved.isdisjoint(name.split(" "))]
# ->Use this function For getting the Scenes with all appropriate data extracted
def getScenes(refined, total_scenes, characters):
    """Group refined paragraphs into scenes with dialogues and transitions.

    A paragraph that starts like a slug line opens a new scene. Any other
    paragraph is a dialogue when its first line (text before ``(``) is in
    ``characters``, a transition when it contains one of ``transitions``
    (case-insensitive), and an action line otherwise. Paragraphs that come
    before the first slug line are not part of any returned scene.

    Args:
        refined: List of paragraph strings.
        total_scenes: Not used by this function; kept for the callers.
        characters: Collection of known speaker names.

    Returns:
        A tuple ``(s, actionline, parenthetical_lis, speakers, dialogues)``:

        * ``s`` - one list per scene. Elements are strings (the slug
          paragraph split into lines, and action lines),
          ``{'Transition': text}`` dicts, and dialogue dicts.
        * ``actionline`` - every action line, whitespace-normalised.
        * ``parenthetical_lis`` - the parenthetical of each recorded
          dialogue, ``"NONE"`` when it has none.
        * ``speakers`` - unique speakers in order of first appearance.
        * ``dialogues`` - every dialogue dict, of the form
          ``{speaker: [parenthetical, scene_no, text, len(text), prev, next]}``.
    """
    slug_starts = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT',
                   'INT/EXT', 'I/E', 'E/I')
    # Built once instead of once per paragraph.
    transition_regex = re.compile(
        r'.*(' + '|'.join(re.escape(t) for t in transitions) + r').*',
        re.IGNORECASE)

    i = 0            # dialogues recorded so far in the current scene
    scene_no = 0
    scene = []
    scenes = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []

    for line in refined:
        stripped = line.strip()

        # -> Slug line: close the current scene and start a new one
        if (stripped.startswith(slug_starts)
                or re.match(slug_pattern, stripped)
                or re.match(pat, stripped)):
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        speaker = lis[0].split('(')[0].strip()

        # -> Speaker with optional parenthetical and dialogue
        if speaker in characters:
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                parenthetical = lis[1]
                spoken_lines = lis[2:]
            else:
                parenthetical = "NONE"
                spoken_lines = lis[1:]
            # Join with a space in both cases so the words of a multi-line
            # dialogue are not glued together.
            dia = ' '.join(spoken_lines).replace("\"", '')

            # A cue with neither dialogue nor parenthetical is not recorded.
            if len(dia) == 0 and parenthetical == "NONE":
                continue

            # prev / next neighbours are looked up in `main_lis`.
            # NOTE(review): `main_lis` is not defined in the visible part of
            # this file; when it is missing or has no such entry, both values
            # fall back to "".
            try:
                prev = main_lis[scene_no - 1][i - 1] if i - 1 >= 0 else ""
            except (NameError, LookupError, TypeError):
                prev = ""
            try:
                following = main_lis[scene_no - 1][i + 1]
            except (NameError, LookupError, TypeError):
                following = ""

            mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                               prev, following]}
            i += 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)
            continue

        # -> Transition or action line, collapsed onto a single line
        flat = ' '.join(line.split())
        if transition_regex.match(flat):
            scene.append({'Transition': flat})
        else:
            actionline.append(flat)
            scene.append(flat)

    # -> Append the last scene, then drop the leading entry, which holds
    #    whatever came before the first slug line.
    scenes.append(scene)
    scenes = scenes[1:]

    # Unique speakers in first-seen order (reproducible between runs).
    speakers = list(dict.fromkeys(speakers))

    # -> Split multi-line strings into separate lines; dicts are kept as they are.
    s = []
    for scene in scenes:
        flattened = []
        for ele in scene:
            if isinstance(ele, str):
                flattened.extend(ele.split("\n"))
            else:
                flattened.append(ele)
        s.append(flattened)
    return s, actionline, parenthetical_lis, speakers, dialogues