319 lines
14 KiB
Python
Executable File
319 lines
14 KiB
Python
Executable File
# import textract
|
||
import re
|
||
import docx
|
||
import os
|
||
doc = docx.Document()
|
||
from docx.shared import Inches, Cm, Pt
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
|
||
from MNF.settings import BasePath
|
||
basePath = BasePath()
|
||
|
||
# -> Patterns for detection of slug lines, transitions, action lines and dialogues

# Slug line: optional scene number ("12" or "12."), optional whitespace, then
# INT / EXT.  Used with re.match, i.e. anchored at the start of the text only.
# NOTE(review): the character classes also accept "ENT" and "IXT", and there is
# no word boundary after the "T", so any text that merely begins with e.g.
# "INTERCUT" or "EXTREME" matches as well.
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'

# Combined interior/exterior marker: optional scene number, optional
# whitespace, then "I/E" or "E/I" (also "I/I" and "E/E"), optional full stop.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Literal transition cues; getScenes searches for them case-insensitively.
transitions = [
    'CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:',
    'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:',
    'INTERVAL',
]

# An upper-case line containing any of these words is never reported as a
# character name by getSpeakers.  (The duplicated 'FADE' entry was removed.)
reserved_words = ['MONTAGE', 'PBS', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']
|
||
|
||
|
||
# -> Helper marked by the original author as unused ("Random function - no use")
def breaksen(s):
    """Split a long sentence into chunks of at most 32 words.

    Args:
        s: Text to split.

    Returns:
        ``[s]`` unchanged when ``s`` has 256 words or fewer (whitespace is
        left exactly as given).  Otherwise a list of strings, each holding up
        to 32 consecutive words joined by single spaces.
    """
    words = s.split()
    if len(words) <= 256:
        return [s]
    # Step through the word list in strides of 32.  The previous
    # ``range(n // 32 + 1)`` form produced a trailing empty chunk whenever the
    # word count was an exact multiple of 32.
    return [" ".join(words[start:start + 32])
            for start in range(0, len(words), 32)]
|
||
|
||
|
||
# -> Function for extracting the cleaned, paragraph-split text of a script (docx)

# Line prefixes accepted as a slug line, and prefixes that must never count as
# one even though they begin with "INT".
_SLUG_LINE_PREFIXES = ('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ',
                       'EXT/INT', 'INT/EXT')
_NOT_SLUG_PREFIXES = ('INTERCUT', 'INTERMISSION', 'INTERVAL')

# Speaker-cue pattern.  It accepts exactly the same strings as the previous
#     ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but every word must now be followed by a non-empty separator before the next
# word starts, so there is only one way to split a run of capitals.  The old
# form could split "ABC" as (ABC), (AB)(C), (A)(BC), ... and therefore needed
# exponential time on long upper-case lines that end in e.g. "." or "!".
_SPEAKER_NAME_RE = re.compile(
    r"(?:[A-Z'’]+(?:\s+-*|-+))*[A-Z'’]*(?:#*\s*[1-9])*(?:\(.*\))?")
# Honorific followed by a surname, e.g. "DR. WHO" (pattern text unchanged).
_SPEAKER_TITLE_RE = re.compile(
    r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+")
_PARENTHETICAL_LINE_RE = re.compile(r"\(.*\)")


def _starts_like_slug(text):
    """Return True if ``text`` begins with a slug-line marker (no exclusions)."""
    return bool(text.startswith(_SLUG_LINE_PREFIXES)
                or re.match(slug_pattern, text)
                or re.match(pat, text))


def _is_speaker_cue(line):
    """Return True if ``line`` is upper-case and shaped like a speaker name."""
    return line.isupper() and bool(_SPEAKER_NAME_RE.fullmatch(line)
                                   or _SPEAKER_TITLE_RE.fullmatch(line))


def getRefined(filename1):
    """Extract a script's text and return it as slug-led paragraphs.

    Steps, all visible in the body below:

    1. ``textract`` extracts the text of ``filename1``; it is written to
       ``<basePath>/conversion/translation/file.txt`` and read back as UTF-8.
    2. Each stripped line is classified.  Nothing is kept until the first slug
       line has been seen.  After a speaker cue, blank lines are dropped and
       parenthetical lines are kept until the first other line (the dialogue).
       Every other line, including blank ones, is copied as is.  The result
       goes to ``.../file1.txt``.
    3. ``file1.txt`` is run through ``textract`` again, stored in ``file.txt``,
       split on blank lines and stripped; empty paragraphs are dropped.
    4. Leading paragraphs that do not start with a slug marker are discarded.

    Args:
        filename1: Path of the script document handed to ``textract.process``.

    Returns:
        ``(refined, total_scenes)``: ``refined`` is the list of paragraph
        strings; ``total_scenes`` is the number of leading paragraphs skipped
        in step 4.
        NOTE(review): despite its name this is not a scene count, and because
        step 2 writes nothing before the first slug line it looks like it will
        normally be 0 -- confirm what callers expect.
    """
    # The module-level ``import textract`` is commented out in this file, so
    # without this local import the calls below fail with NameError.
    import textract

    print("Get_Refined_Called")
    print("filname", filename1)

    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    # Round-trip through file.txt: decodes the bytes leniently and lets text
    # mode normalise line endings.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        doc11 = raw_file.read()

    seen_slug = False       # becomes True at the first slug line
    dialog_coming = False   # True between a speaker cue and its dialogue line

    # Trace prints are kept verbatim; only the "stuck here" / "Qualified ..."
    # hang-diagnosis probes were dropped (they re-ran the speaker regex up to
    # three times per line).
    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("Original Line:", line)

            # -> Detection of slug lines
            if _starts_like_slug(line) and not line.startswith(_NOT_SLUG_PREFIXES):
                seen_slug = True
                f1.write(line + '\n')
                continue

            print("line 88: Other than Slugline:", line)
            if not seen_slug:
                # Front matter before the first scene heading is ignored.
                continue
            print("line 90: else-if condition:", line)

            # -> Detection of probable dialogues / parentheticals
            if dialog_coming:
                if line == "":
                    print("line empty or just have newline", line)
                    continue
                print("Probable dialogue or PC: ", line)
                f1.write(line + '\n')
                if not _PARENTHETICAL_LINE_RE.match(line):
                    # First non-parenthetical line after the cue is the
                    # dialogue; the cue is now satisfied.
                    dialog_coming = False
                    print("line 107: else of PCs and dialog over")
                continue

            # -> Detection of a speaker, which implies dialogue follows
            if _is_speaker_cue(line):
                print("line 111: May be speaker: ", line)
                f1.write(line + '\n')
                dialog_coming = True
                continue

            # -> Action line or anything else (blank lines included, which
            #    is what later separates paragraphs)
            print("line 120 Actionline or Something else", line)
            f1.write(line + '\n')
    print("line 125 file closed")

    # -> Copy the cleaned text from file1.txt back into file.txt as bytes
    text = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    print("line 130: ", text)
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)

    # Read with the same explicit encoding used everywhere else in this
    # function instead of the platform default.
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    # -> Build the refined list of paragraphs
    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]

    # -> Paragraphs in front of the first slug line are rejected
    total_scenes = 0
    for index, para in enumerate(refined):
        if _starts_like_slug(para):
            refined = refined[index:]
            break
        total_scenes += 1

    print("line 156:Refined", refined)
    return refined, total_scenes
|
||
|
||
|
||
# -> For getting slug paragraphs and non-slug paragraphs from the refined data
def getSlugAndNonSlug(refined):
    """Partition paragraphs into slug paragraphs and everything else.

    A paragraph counts as a slug paragraph when, after stripping, it starts
    with ``INT``, ``EXT``, ``I/E`` or ``E/I``, or when ``slug_pattern`` or
    ``pat`` matches at its start.  (The longer prefixes tested previously --
    ``INT.``, ``EXT/INT``, ``INT/EXT`` ... -- all begin with ``INT``/``EXT``,
    so the shorter tuple is equivalent.)

    NOTE(review): unlike the line classifier in getRefined, paragraphs that
    begin with INTERCUT / INTERMISSION / INTERVAL are not excluded here and
    are therefore reported as slug paragraphs.  Behaviour kept as found.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs, each
        in input order.
    """
    slug_prefixes = ('INT', 'EXT', 'I/E', 'E/I')
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        is_slug = (para.startswith(slug_prefixes)
                   or re.match(slug_pattern, para)
                   or re.match(pat, para))
        (sluglines if is_slug else without_slug).append(para)
    return sluglines, without_slug
|
||
|
||
|
||
# -> For extracting the speakers from the non-slug paragraphs
def getSpeakers(without_slug):
    """Collect the distinct character names found in non-slug paragraphs.

    A line is taken as a speaker cue when it is upper-case, the probed
    look-ahead line (see below) is not blank, and it starts like a name
    according to the cue regex.  The name is the text before the first "(",
    stripped.  Names containing any word from ``reserved_words`` are dropped.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by
            ``\\n``).

    Returns:
        List of unique names in order of first appearance.  (The previous
        ``list(set(...))`` returned the same names in arbitrary order.)
    """
    cue_re = re.compile(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
    blocked = set(reserved_words)
    found = {}  # dict used as an insertion-ordered set

    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for pos, item in enumerate(lines):
            if not item.isupper():
                continue
            # NOTE(review): this is exactly what the original counter-and-
            # clamp arithmetic evaluated to -- the line TWO positions ahead,
            # clamped to the paragraph's last line (so a one-line paragraph
            # probes itself).  It looks like "the next line" was intended;
            # kept unchanged until that is confirmed.
            probe = lines[min(pos + 2, last)]
            if probe.strip() == "":
                continue
            if cue_re.match(item):
                name = item.split("(")[0].strip()
                if name:
                    found.setdefault(name)

    return [name for name in found
            if blocked.isdisjoint(name.split(" "))]
|
||
|
||
|
||
# -> Use this function for getting the scenes with all appropriate data extracted
def getScenes(refined, total_scenes, characters):
    """Group refined paragraphs into scenes and extract the dialogue data.

    Each paragraph is handled as one of:

    * slug paragraph -- starts a new scene; the paragraph is stored as text.
    * speaker paragraph -- its first line (text before "(") is in
      ``characters``.  An optional second line in parentheses is the
      parenthetical; the remaining lines, joined with single spaces and with
      double quotes removed, are the dialogue.  Stored as
      ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue), prev, next]}``
      with ``"NONE"`` for a missing parenthetical.  A speaker paragraph with
      neither dialogue nor parenthetical is dropped.
    * anything else -- whitespace is collapsed to single spaces; stored as
      ``{'Transition': text}`` if it contains one of ``transitions``
      (case-insensitive), otherwise as an action-line string.

    NOTE(review): as in the original, a paragraph starting with INTERCUT /
    INTERMISSION / INTERVAL satisfies the slug test (the exclusion applied in
    getRefined is absent here), so it opens a new scene and is never reported
    as a transition.

    Args:
        refined: Paragraph strings, as returned by getRefined.
        total_scenes: Unused; kept so existing callers keep working.
        characters: Container of known speaker names (see getSpeakers).

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:
        ``scenes`` is a list with one list per scene whose items are strings
        (slug / action lines, split on newlines) or the dicts described above.
        Anything before the first slug paragraph is discarded.  ``actionline``
        holds all action-line strings, ``parenthetical_lis`` one entry per
        dialogue, ``speakers`` the unique speakers in order of first
        appearance, and ``dialogues`` all dialogue dicts.
    """
    slug_prefixes = ('INT', 'EXT', 'I/E', 'E/I')
    parenthetical_re = re.compile(r"\(.*\)")
    # Compiled once here instead of once per paragraph.
    transition_re = re.compile(
        r'.*(' + '|'.join(re.escape(t) for t in transitions) + r').*',
        re.IGNORECASE)

    # NOTE(review): ``main_lis`` (per-scene speaker lists used for the prev /
    # next speaker fields) is not defined anywhere in the visible part of this
    # module; the original hid the resulting NameError behind a bare
    # ``except:``.  Look it up explicitly so that a missing table simply
    # yields "" while unrelated errors are no longer swallowed.
    speaker_table = globals().get("main_lis")

    def is_slug(text):
        return bool(text.startswith(slug_prefixes)
                    or re.match(slug_pattern, text)
                    or re.match(pat, text))

    def neighbour_speaker(scene_index, position):
        if position < 0:
            return ""
        try:
            return speaker_table[scene_index][position]
        except (TypeError, LookupError):
            return ""

    scenes = []
    scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    dialogue_index = 0  # position of the next dialogue within its scene

    for line in refined:
        # -> Detection of slug lines
        if is_slug(line.strip()):
            print("Slug-line Case")
            scenes.append(scene)
            scene = [line]
            dialogue_index = 0
            scene_no += 1
            continue

        # -> Detection of action lines, speakers, dialogues, transitions
        print("Not Slug-line Case")
        lis = [part.strip() for part in line.split("\n")]
        print(" \n Line 232 probable dialogue list", lis)
        speaker = lis[0].split('(')[0].strip()

        if speaker not in characters:
            flat = ' '.join(line.split())
            if transition_re.match(flat):
                scene.append({'Transition': flat})
            else:
                actionline.append(flat)
                scene.append(flat)
            continue

        # -> Speaker paragraph: parenthetical (optional) and dialogue
        print("Speaker 238", speaker)
        if len(lis) > 1 and parenthetical_re.match(lis[1]):
            parenthetical = lis[1]
            spoken = lis[2:]
        else:
            parenthetical = "NONE"
            spoken = lis[1:]
        # Both branches now join with a space; the no-parenthetical branch
        # used '' and glued the last word of one line to the first word of
        # the next.
        dia = ' '.join(spoken).replace("\"", '')
        print(" length dia\n", len(dia))

        if len(dia) == 0 and parenthetical == "NONE":
            # A cue with nothing attached carries no dialogue data.
            continue
        print(" len dia != and Parenthetical == NONE: 384 ")

        # prev / next are the previous and next speaker of this dialogue
        prev_speaker = neighbour_speaker(scene_no - 1, dialogue_index - 1)
        next_speaker = neighbour_speaker(scene_no - 1, dialogue_index + 1)
        mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                           prev_speaker, next_speaker]}
        print("line 270", mydic)

        dialogue_index += 1
        speakers.append(speaker)
        parenthetical_lis.append(parenthetical)
        dialogues.append(mydic)
        scene.append(mydic)

    # -> The last scene is still open when the loop ends
    scenes.append(scene)

    speakers = list(dict.fromkeys(speakers))
    # The first entry holds whatever preceded the first slug paragraph.
    scenes = scenes[1:]

    # -> Split multi-line text items into one string per line; dicts are kept
    s = []
    for scene in scenes:
        flattened = []
        for ele in scene:
            if isinstance(ele, str):
                flattened.extend(ele.split("\n"))
            else:
                flattened.append(ele)
        s.append(flattened)

    print("dialogue: ", dialogues)
    return s, actionline, parenthetical_lis, speakers, dialogues
|