# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/script_reading.py
#
# Repository viewer metadata: 319 lines, 14 KiB, Python, executable file.
# (Viewer links: Raw, Permalink, Blame, History.)
#
# NOTE: the repository viewer reported that this file contains ambiguous
# Unicode characters, i.e. characters that might be confused with other
# characters. If that is intentional, the warning can safely be ignored;
# the viewer's "Escape" button reveals the characters in question.

# import textract
import re
import docx
import os
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
basePath = BasePath()
# -> Patterns and keyword lists for detecting sluglines, transitions,
#    action lines and dialogues.

# Optional digits, an optional dot and optional whitespace, followed by one of
# I/E, then one of N/X, then T (so it matches e.g. "INT", "EXT", "12. INT").
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'

# Optional digits and whitespace followed by a combined "I/E"-style marker
# (any of I/I, I/E, E/I, E/E) and an optional trailing dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Phrases that mark a line as a transition.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]

# Words that disqualify an upper-case line from being treated as a speaker.
# ('FADE' is listed twice; the duplicate is kept so the list is unchanged.)
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]
# -> Random function - no use
def breaksen(s, *, max_words=256, chunk_size=32):
    """Split a long sentence into chunks of at most ``chunk_size`` words.

    Args:
        s: Text to split. Words are delimited by any whitespace.
        max_words: Texts with at most this many words are returned unchanged
            as a single-element list.
        chunk_size: Number of words per chunk when the text is split.

    Returns:
        ``[s]`` when ``s`` has at most ``max_words`` words; otherwise a list of
        chunks in which the words are re-joined with single spaces. No empty
        chunk is produced when the word count is an exact multiple of
        ``chunk_size``.
    """
    # Split once; the word list is reused for both the length check and the
    # slicing below.
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk_size stops exactly at the last word, so there is never
    # a trailing empty chunk.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# -> Function for Getting Languages and Scripts related to a Script(Docx)

# Speaker-cue patterns used by getRefined, compiled once at import time.
#
# _SPEAKER_CUE_RE accepts exactly the same strings as the pattern it replaces,
#     ([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but avoids a repeated group that itself starts with an unbounded repeat.
# With the old form a long all-caps line that does not match (for example one
# ending in ':' or '.') made the regex engine try exponentially many ways of
# splitting each run of letters.
_SPEAKER_CUE_RE = re.compile(
    # runs of letters/apostrophes separated by whitespace and/or hyphens,
    # where whitespace may not directly follow a hyphen
    r"(?:[A-Z']+(?:(?:\s+-*|-+)[A-Z']+)*\s*-*)?"
    # optional numbering such as "#1" or " 2"
    r"(?:#*\s*[1-9])*"
    # optional trailing parenthesised extension such as "(V.O.)"
    r"(?:\(.*\))?"
)
# Honorific followed by a single upper-case word, e.g. "DR. SMITH".
_TITLED_SPEAKER_RE = re.compile(
    r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+"
)
# Literal prefixes that mark the start of a slugline.
_SLUG_PREFIXES = ('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT')
# Prefixes that look like "INT..." but must not start a new scene.
_NOT_SLUG_PREFIXES = ('INTERCUT', 'INTERMISSION', 'INTERVAL')


def _is_slugline_start(text):
    """Return True when ``text`` (ignoring outer whitespace) starts like a slugline."""
    text = text.strip()
    return bool(
        text.startswith(_SLUG_PREFIXES)
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )


def getRefined(filename1):
    """Extract a script's text and return it as a list of cleaned paragraphs.

    The text is extracted with ``textract`` and everything before the first
    slugline is dropped. The remaining lines are rewritten so that a speaker
    cue, its parentheticals and the first dialogue line are contiguous (blank
    lines between them are removed). The result is then split on blank lines.

    Two scratch files with fixed names, ``file.txt`` and ``file1.txt`` under
    ``{basePath}/conversion/translation``, are overwritten on every call, so
    concurrent calls overwrite each other's data.

    Args:
        filename1: Path of the script document to read.

    Returns:
        A tuple ``(refined, total_scenes)``. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first paragraph that
        looks like a slugline (all paragraphs if none does).
        ``total_scenes`` is, despite its name, the number of paragraphs that
        precede that first slugline paragraph.
    """
    print("Get_Refined_Called")
    print("filname", filename1)
    # NOTE(review): ``textract`` is used here but ``import textract`` is
    # commented out at the top of this file; unless the name is provided some
    # other way this call raises NameError. Confirm and restore the import.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    filename = rf"{basePath}/conversion/translation/file.txt"
    filtered_path = rf"{basePath}/conversion/translation/file1.txt"
    with open(filename, 'wb') as raw_out:
        raw_out.write(text)
    with open(filename, 'r', encoding="utf8", errors='ignore') as raw_in:
        doc11 = raw_in.read()

    dialog_coming = False  # True right after a speaker cue has been written
    flag = False           # True once the first slugline has been seen
    with open(filtered_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("Original Line:", line)

            # -> For Detection of Slug lines
            if _is_slugline_start(line) and not line.startswith(_NOT_SLUG_PREFIXES):
                flag = True
                f1.write(line + '\n')
                continue

            # -> for Detection of Probable Dialogues,Speakers and Actionlines
            print("line 88: Other than Slugline:", line)
            if not flag:
                # Nothing is kept until the first slugline has been found.
                continue
            print("line 90: else-if condition:", line)
            print("stuck here 1")
            print("stuck here 1.1")
            # ``line`` is already stripped, so an "empty" line is exactly "".
            if dialog_coming and line == "":
                print("stuck here 2")
                print("line empty or just have newline", line)
                continue
            print("stuck here 3")

            # -> for Detection of Probable Dialogues
            if dialog_coming:
                print("Probable dialogue or PC: ", line)
                f1.write(line + '\n')
                if re.match(r"\(.*\)", line):
                    # Parenthetical: the dialogue itself is still to come.
                    print("stuck here 4")
                else:
                    print("stuck here 5")
                    dialog_coming = False
                    print("line 107: else of PCs and dialog over")
                continue

            # -> Detection of Speaker which implies that next line will be
            #    dialogue (dialog_coming=True)
            print("stuck here 6")
            is_upper = line.isupper()
            cue_match = _SPEAKER_CUE_RE.fullmatch(line)
            titled_match = _TITLED_SPEAKER_RE.fullmatch(line)
            print("Qualified a certian category1" if is_upper
                  else "not Qualified a certian category1")
            print("Qualified a certian category2" if cue_match
                  else "bot Qualified a certian categor2")
            print("Qualified a certian category3" if titled_match
                  else "bot Qualified a certian category3")
            if is_upper and (cue_match or titled_match):
                print("line 111: May be speaker: ", line)
                f1.write(line + '\n')
                dialog_coming = True
                continue
            print("stuck here 7")

            # -> Detection of Actionline , etc. Blank lines also end up here,
            #    which keeps the paragraph separators in the output.
            print("line 120 Actionline or Something else", line)
            f1.write(line + '\n')
            print("stuck here 8")
    print("line 125 file closed")

    # -> copying all the data in file1.txt to file.txt with bytes included
    text = textract.process(filtered_path, encoding="utf8", errors='ignore')
    print("line 130: ", text)
    with open(filename, 'wb') as raw_out:
        raw_out.write(text)
    # Read back as UTF-8 to match how the data was produced; relying on the
    # platform default encoding can fail on non-ASCII scripts.
    with open(filename, "r", encoding="utf8", errors='ignore') as raw_in:
        paragraphs = raw_in.read().split('\n\n')

    # -> Creating Refined List of Scenes and its data
    refined = [para.strip() for para in paragraphs if para.strip()]

    # -> Paragraphs that precede the first slugline are rejected; their count
    #    is reported as total_scenes.
    first_slug = next(
        (idx for idx, para in enumerate(refined) if _is_slugline_start(para)),
        None,
    )
    if first_slug is None:
        total_scenes = len(refined)
    else:
        total_scenes = first_slug
        refined = refined[first_slug:]
    print("line 156:Refined", refined)
    return refined, total_scenes
# -> For Getting Slug lines and Non-Slug lines from Refined Data
def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and all other paragraphs.

    Each paragraph is stripped of surrounding whitespace first. A paragraph is
    a slugline when it starts with one of the literal prefixes below or when
    ``slug_pattern`` / ``pat`` match at its start.

    NOTE(review): because of the bare 'INT' prefix, paragraphs beginning with
    e.g. "INTERCUT" are also classified as sluglines here.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A tuple ``(sluglines, without_slug)`` of two lists, each keeping the
        input order.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for raw_para in refined:
        para = raw_para.strip()
        # Cheap literal prefixes first; the regexes only run when none fits.
        looks_like_slug = (
            para.startswith(heading_prefixes)
            or re.match(slug_pattern, para)
            or re.match(pat, para)
        )
        bucket = sluglines if looks_like_slug else without_slug
        bucket.append(para)
    return sluglines, without_slug
# -> For Extracting the Speakers from Refined Data
def getSpeakers(without_slug):
    """Collect the speaker names that appear in non-slugline paragraphs.

    A line is taken as a speaker cue when it is upper-case, the look-ahead
    line described below is not blank, it does not contain a known transition
    phrase, and it starts with a letter or apostrophe. The name is the part
    of the line before the first "(".

    Lines containing a phrase from ``transitions`` are skipped. Otherwise
    cues such as "DISSOLVE TO:" are reported as characters, and ``getScenes``
    then treats them as speakers without dialogue instead of transitions.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by "\\n").

    Returns:
        A list of unique, non-empty speaker names in first-seen order, without
        any name that contains one of ``reserved_words`` as a
        space-separated word.
    """
    cue_pattern = re.compile(r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")
    found = []
    for para in without_slug:
        lines = para.split('\n')
        last_index = len(lines) - 1
        for idx, item in enumerate(lines):
            if not item.isupper():
                continue
            # The look-ahead is the line two positions further on, clamped to
            # the last line of the paragraph (so near the end a line is
            # compared with the last line, possibly itself).
            # NOTE(review): kept exactly as before; it is unclear whether the
            # directly following line was intended.
            if lines[min(idx + 2, last_index)].strip() == "":
                continue
            # Collapse whitespace before comparing, as getScenes does for
            # its transition test.
            normalized = ' '.join(item.split())
            if any(marker in normalized for marker in transitions):
                continue
            if cue_pattern.match(item):
                found.append(item.split("(")[0].strip())
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    unique_names = list(dict.fromkeys(found))
    return [
        name for name in unique_names
        if name and set(name.split(" ")).isdisjoint(reserved_words)
    ]
# ->Use this function For getting the Scenes with all appropriate data extracted
def getScenes(refined, total_scenes, characters):
    """Group refined paragraphs into scenes and classify their contents.

    Every paragraph that starts like a slugline opens a new scene. Other
    paragraphs become one of:

    * a dialogue dict ``{speaker: [parenthetical, scene_no, dialogue,
      len(dialogue), prev, next]}`` when the first line (up to "(") is in
      ``characters`` and there is dialogue text or a parenthetical;
    * ``{'Transition': text}`` when the text contains a phrase from
      ``transitions`` (case-insensitive);
    * a plain action-line string otherwise.

    A paragraph whose first line is a known character but which has neither
    dialogue nor a parenthetical is dropped.

    NOTE(review): any paragraph starting with "INT"/"EXT" opens a scene here,
    including "INTERCUT ..."/"INTERVAL", which getRefined explicitly does not
    treat as sluglines. Confirm which behaviour is wanted.

    Args:
        refined: List of paragraph strings, as returned by ``getRefined``.
        total_scenes: Unused; kept for interface compatibility.
        characters: Collection of known speaker names.

    Returns:
        A tuple ``(scenes, actionline, parenthetical_lis, speakers,
        dialogues)``. ``scenes`` is a list with one list per scene holding
        strings (slugline and action lines, split on newlines) and dicts
        (dialogues, transitions). ``actionline`` lists all action-line
        strings, ``parenthetical_lis`` the parenthetical of every dialogue
        ('NONE' when absent), ``speakers`` the unique speakers in first-seen
        order, and ``dialogues`` all dialogue dicts in order.
    """
    slug_prefixes = ('INT', 'EXT', 'I/E', 'E/I')
    transition_re = None  # compiled on first use, then reused
    i = 0                 # number of dialogues recorded in the current scene
    scene = []
    scenes = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    parenthetical = 'NONE'

    for line in refined:
        stripped = line.strip()
        # -> For Detection of Slug lines
        if (stripped.startswith(slug_prefixes)
                or re.match(slug_pattern, stripped)
                or re.match(pat, stripped)):
            print("Slug-line Case")
            # Close the previous scene (an empty list before the first
            # slugline; it is discarded after the loop).
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        # -> For Detection of Actionlines, Speakers, Dialogues,Transitions
        print("Not Slug-line Case")
        lis = [part.strip() for part in line.split("\n")]
        print(" \n Line 232 probable dialogue list", lis)
        speaker = lis[0].split('(')[0].strip()

        if speaker in characters:
            print("Speaker 238", speaker)
            # A parenthetical, when present, is the line right after the cue.
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                parenthetical = lis[1]
                spoken_lines = lis[2:]
            else:
                spoken_lines = lis[1:]
            # Join with a space in both cases so that the words of a
            # multi-line dialogue are not glued together.
            dia = ' '.join(spoken_lines).replace("\"", '')
            print(" length dia\n", len(dia))

            if len(dia) == 0 and parenthetical == "NONE":
                # Speaker cue with nothing attached: nothing to record.
                continue
            print(" len dia != and Parenthetical == NONE: 384 ")

            # prev/next are meant to be the neighbouring speakers.
            # NOTE(review): ``main_lis`` is not defined anywhere in the
            # visible part of this file. If it is undefined everywhere, the
            # NameError is swallowed and both values are always "".
            prev_speaker = ""
            next_speaker = ""
            if i >= 1:
                try:
                    prev_speaker = main_lis[scene_no - 1][i - 1]
                except Exception:
                    prev_speaker = ""
            try:
                next_speaker = main_lis[scene_no - 1][i + 1]
            except Exception:
                next_speaker = ""

            mydic = {
                speaker: [parenthetical, scene_no, dia, len(dia),
                          prev_speaker, next_speaker],
            }
            print("line 270", mydic)
            i += 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)
            parenthetical = "NONE"
        else:
            # Collapse all whitespace (including newlines) to single spaces.
            flat = ' '.join(line.split())
            if transition_re is None:
                transition_re = re.compile(
                    r'.*(' + '|'.join(re.escape(t) for t in transitions) + r').*',
                    re.IGNORECASE,
                )
            if transition_re.match(flat):
                scene.append({'Transition': flat})
            else:
                actionline.append(flat)
                scene.append(flat)

    # -> Appending the leftover last scene, then dropping the placeholder that
    #    was stored before the first slugline.
    scenes.append(scene)
    scenes = scenes[1:]
    speakers = list(dict.fromkeys(speakers))

    # -> Splitting multi-line strings (e.g. a slugline paragraph with text
    #    below it) into separate entries; dicts are kept as they are.
    s = []
    for scene in scenes:
        entries = []
        for ele in scene:
            if isinstance(ele, str):
                entries.extend(ele.split("\n"))
            else:
                entries.append(ele)
        s.append(entries)
    print("dialogue: ", dialogues)
    return s, actionline, parenthetical_lis, speakers, dialogues