319 lines
14 KiB
Python
319 lines
14 KiB
Python
|
# import textract
|
|||
|
import re
|
|||
|
import docx
|
|||
|
import os
|
|||
|
doc = docx.Document()
|
|||
|
from docx.shared import Inches, Cm, Pt
|
|||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|||
|
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
|
|||
|
from MNF.settings import BasePath
|
|||
|
basePath = BasePath()
|
|||
|
|
|||
|
# -> Patterns for detecting slug lines, transitions, action lines and dialogues

# Scene heading: optional scene number ("12", "12.") followed by INT or EXT.
# The earlier character-class form [IE][NX]T also accepted "ENT" and "IXT",
# so a paragraph starting with e.g. "ENTER" was mistaken for a scene heading.
slug_pattern = r'[\d]*[.]?[\s]*(?:INT|EXT)'

# Scene heading written with a combined marker: optional scene number followed
# by I/E or E/I (as written, the character classes also accept I/I and E/E).
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Phrases that mark a paragraph as a transition rather than an action line.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]

# Capitalised words that must never be reported as part of a speaker name.
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]
|
|||
|
|
|||
|
|
|||
|
# -> Helper that splits long text into 32-word chunks (not called anywhere in this file)
|
|||
|
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long text into chunks of at most ``chunk_words`` words.

    Args:
        s: Text to split. Words are whatever ``str.split()`` yields.
        max_words: Texts with at most this many words are returned
            untouched as a single-element list.
        chunk_words: Number of words per chunk for longer texts.

    Returns:
        ``[s]`` when the text is short enough, otherwise a list of chunks,
        each made of consecutive words joined by single spaces.

    Raises:
        ValueError: If ``chunk_words`` is not positive.
    """
    if chunk_words <= 0:
        raise ValueError("chunk_words must be a positive integer")

    # Split once; the earlier version re-split the whole text per chunk.
    words = s.split()
    if len(words) <= max_words:
        return [s]

    # Stepping by chunk_words never produces an empty trailing chunk, which
    # the earlier "n // 32 + 1" loop did whenever n was a multiple of 32.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
|
|||
|
|
|||
|
|
|||
|
# -> Extracts the text of a script file and returns its paragraphs, starting at the first slug line
|
|||
|
# Capitalised speaker cue such as "JOHN", "MARY-ANN", "COP #1" or "JOHN (V.O.)".
# It accepts the same strings as the earlier pattern
#   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but two runs of letters must now be separated by whitespace/hyphens, so a
# run of letters can only be matched one way. The nested "(X+)*" form could
# backtrack exponentially under fullmatch() on long capitalised lines that do
# not match (for example an upper-case sentence ending in a full stop).
_SPEAKER_CUE_RE = re.compile(
    r"(?:[A-Z'’]+(?:(?:\s+-*|-+)[A-Z'’]+)*(?:\s+-*|-+)?)?"
    r"(?:#*\s*[1-9])*"
    r"(?:\(.*\))*"
)

# Speaker cue introduced by an honorific, e.g. "DR. WATSON".
_TITLED_SPEAKER_RE = re.compile(
    r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+"
)


def _starts_like_slug(text):
    """Return True when ``text`` begins like a scene heading (slug line)."""
    return bool(
        text.startswith(('INT.', 'INT ', 'EXT.', 'EXT ',
                         'EXT/INT', 'INT/EXT', 'I/E', 'E/I'))
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slug line on.

    The text of ``filename1`` is extracted with ``textract``, every line is
    stripped, and the lines are re-written to a staging file with the blank
    lines between a speaker cue and the following dialogue removed. The
    staged text is then split on blank lines into paragraphs.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        A ``(refined, total_scenes)`` tuple. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first paragraph that
        begins like a slug line (the full list if none does). Despite its
        name, ``total_scenes`` is the number of leading paragraphs that were
        skipped before that first slug-line paragraph.
    """
    print("Get_Refined_Called")
    total_scenes = 0
    print("filname", filename1)

    # NOTE(review): both scratch files live at fixed paths, so two calls
    # running at the same time would overwrite each other's data.
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    staged_path = rf"{basePath}/conversion/translation/file1.txt"

    # NOTE(review): the module-level "import textract" is commented out in
    # this file, so this call raises NameError unless `textract` is made
    # available by other means.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        doc11 = raw_file.read()

    dialog_coming = False  # True while the line after a speaker cue is awaited
    flag = False           # becomes True once the first slug line has been seen

    with open(staged_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("Original Line:", line)

            # -> Slug lines (INTERCUT / INTERMISSION / INTERVAL are not headings)
            if _starts_like_slug(line) and not line.startswith(
                    ('INTERCUT', 'INTERMISSION', 'INTERVAL')):
                flag = True
                f1.write(line + '\n')
                continue

            # -> Probable dialogues, speakers and action lines
            print("line 88: Other than Slugline:", line)

            # NOTE(review): nesting reconstructed from a copy of this file
            # whose indentation was lost -- confirm that everything below is
            # meant to run only after the first slug line.
            if not flag:
                continue

            print("line 90: else-if condition:", line)
            print("stuck here 1")
            # A stripped line can never equal '\n', so the former
            # "line.strip() == '\n'" early exit was unreachable and is gone.
            print("stuck here 1.1")

            # Blank lines between a speaker cue and the dialogue are dropped
            # so that cue and dialogue end up in the same paragraph.
            if dialog_coming and line == "":
                print("stuck here 2")
                print("line empty or just have newline", line)
                continue
            print("stuck here 3")

            # -> Probable dialogue or parenthetical following a speaker cue
            if dialog_coming:
                print("Probable dialogue or PC: ", line)
                f1.write(line + '\n')
                if re.match(r"\(.*\)", line):
                    # A parenthetical: the dialogue itself is still to come.
                    print("stuck here 4")
                else:
                    print("stuck here 5")
                    dialog_coming = False
                    print("line 107: else of PCs and dialog over")
                continue

            # -> Speaker cue: the next non-blank line is treated as dialogue
            print("stuck here 6")
            is_upper = line.isupper()
            plain_cue = _SPEAKER_CUE_RE.fullmatch(line)
            titled_cue = _TITLED_SPEAKER_RE.fullmatch(line)

            if is_upper:
                print("Qualified a certian category1")
            else:
                print("not Qualified a certian category1")

            if plain_cue:
                print("Qualified a certian category2")
            else:
                print("bot Qualified a certian categor2")

            if titled_cue:
                print("Qualified a certian category3")
            else:
                print("bot Qualified a certian category3")

            if is_upper and (plain_cue or titled_cue):
                print("line 111: May be speaker: ", line)
                f1.write(line + '\n')
                dialog_coming = True
                continue
            print("stuck here 7")

            # -> Action line or anything else (blank lines included, which is
            #    what later separates the paragraphs)
            print("line 120 Actionline or Something else", line)
            f1.write(line + '\n')
            print("stuck here 8")

    print("line 125 file closed")

    # -> Copy the staged text back into file.txt by way of textract
    text = textract.process(staged_path, encoding="utf8", errors='ignore')
    print("line 130: ", text)
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)

    # Read back with the encoding the bytes were produced in; relying on the
    # platform default can fail or garble non-ASCII text.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    # -> Refined list of paragraphs (scene headings and their content)
    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]

    # Paragraphs that precede the first slug-line paragraph are rejected.
    for index, paragraph in enumerate(refined):
        if _starts_like_slug(paragraph):
            refined = refined[index:]
            break
        total_scenes += 1

    print("line 156:Refined", refined)
    return refined, total_scenes
|
|||
|
|
|||
|
|
|||
|
# -> For Getting Slug lines and Non-Slug lines from Refined Data
|
|||
|
def getSlugAndNonSlug(refined):
    """Partition paragraphs into slug-line paragraphs and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A ``(sluglines, without_slug)`` tuple of lists. Every paragraph is
        stripped and lands in exactly one list, in its original order. A
        paragraph counts as a slug line when it starts with one of the
        INT/EXT style prefixes or matches ``slug_pattern`` or ``pat``.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT',
                        'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []

    for raw in refined:
        text = raw.strip()
        is_heading = (
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        bucket = sluglines if is_heading else without_slug
        bucket.append(text)

    return sluglines, without_slug
|
|||
|
|
|||
|
|
|||
|
# -> For Extracting the Speakers from Refined Data
|
|||
|
def getSpeakers(without_slug):
    """Collect the distinct speaker names found in non-slug paragraphs.

    A line is taken as a speaker cue when it is upper-case, starts like a
    capitalised name, and the line two positions further down the same
    paragraph (clamped to the paragraph's last line) is not blank. Anything
    from the first "(" onwards is cut off the name.

    Args:
        without_slug: Iterable of paragraph strings whose lines are
            separated by newlines.

    Returns:
        List of unique, non-empty names that contain none of the
        ``reserved_words``. The order of the list is not defined.
    """
    cue_re = re.compile(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
    candidates = []

    for para in without_slug:
        rows = para.split('\n')
        last = len(rows) - 1
        for pos, row in enumerate(rows):
            if not row.isupper():
                continue
            # Look-ahead line: two rows further on, never past the last row.
            if rows[min(pos + 2, last)].strip() == "":
                continue
            if cue_re.match(row):
                candidates.append(row.split("(")[0].strip())

    return [
        name
        for name in list(set(candidates))
        if len(name) > 0 and set(name.split(" ")).isdisjoint(reserved_words)
    ]
|
|||
|
|
|||
|
|
|||
|
# ->Use this function For getting the Scenes with all appropriate data extracted
|
|||
|
def getScenes(refined, total_scenes, characters):
    """Group refined paragraphs into scenes and pull out their parts.

    Args:
        refined: Paragraph strings in script order; a paragraph that begins
            like a slug line opens a new scene.
        total_scenes: Not used by this function; kept so existing callers
            keep working.
        characters: Known speaker names. A paragraph is a dialogue block when
            its first line, cut at the first "(", is one of them.

    Returns:
        A 5-tuple ``(scenes, actionline, parenthetical_lis, speakers,
        dialogues)``:

        * ``scenes`` - one list per slug line. Text entries (the slug-line
          paragraph and action lines) are split into individual lines;
          dialogue entries are dicts ``{speaker: [parenthetical, scene_no,
          dialogue, len(dialogue), prev, next]}`` and transitions are dicts
          ``{'Transition': text}``. Paragraphs before the first slug line are
          dropped.
        * ``actionline`` - every action-line paragraph, whitespace-collapsed.
        * ``parenthetical_lis`` - one entry per recorded dialogue, ``"NONE"``
          when the dialogue had no parenthetical.
        * ``speakers`` - distinct speakers that had a recorded dialogue
          (order not defined).
        * ``dialogues`` - all dialogue dicts in script order.
    """
    slug_prefixes = ('INT.', 'INT', 'EXT.', 'EXT',
                     'EXT/INT', 'INT/EXT', 'I/E', 'E/I')

    def _neighbour(scene_index, position):
        # NOTE(review): `main_lis` is not defined anywhere in this file, so
        # this lookup raises NameError and every prev/next value ends up as
        # "". A bare "except:" used to hide that; the lookup is kept as-is
        # until its intended source is known.
        try:
            return main_lis[scene_index][position]
        except (NameError, LookupError, TypeError):
            return ""

    scenes = []
    scene = []               # entries gathered for the scene being read
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    i = 0                    # index of the next dialogue within the scene
    transition_re = None     # built on first use, then reused

    for line in refined:
        stripped = line.strip()

        # -> Slug line: close the running scene and open a new one
        if (stripped.startswith(slug_prefixes)
                or re.match(slug_pattern, stripped)
                or re.match(pat, stripped)):
            print("Slug-line Case")
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        # -> Action lines, speakers, dialogues, transitions
        print("Not Slug-line Case")
        lis = [part.strip() for part in line.split("\n")]
        print(" \n Line 232 probable dialogue list", lis)
        speaker = lis[0].split('(')[0].strip()

        if speaker in characters:
            print("Speaker 238", speaker)
            parenthetical = "NONE"
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                parenthetical = lis[1]
                spoken = lis[2:]
            else:
                spoken = lis[1:]
            # Join with a space in both cases: the no-parenthetical case used
            # '' and fused the last word of one line with the first word of
            # the next.
            dia = ' '.join(spoken).replace("\"", '')
            print(" length dia\n", len(dia))

            # A cue with neither dialogue nor parenthetical is not recorded.
            if len(dia) == 0 and parenthetical == "NONE":
                continue

            print(" len dia != and Parenthetical == NONE: 384 ")
            # prev / next are meant to be the previous and next speaker.
            prev = _neighbour(scene_no - 1, i - 1) if i - 1 >= 0 else ""
            following = _neighbour(scene_no - 1, i + 1)

            mydic = {speaker: [parenthetical, scene_no, dia, len(dia),
                               prev, following]}
            print("line 270", mydic)
            i += 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)
            continue

        # -> Transition or action line; collapse all whitespace first
        flat = ' '.join(line.split())
        if transition_re is None:
            transition_re = re.compile(
                r'.*(' + '|'.join(re.escape(t) for t in transitions) + r').*',
                re.IGNORECASE)
        if transition_re.match(flat):
            scene.append({'Transition': flat})
        else:
            actionline.append(flat)
            scene.append(flat)

    # -> The last scene is still open when the loop ends
    scenes.append(scene)

    # The first collected list holds whatever preceded the first slug line.
    # Text entries are split so that every line is its own element.
    split_scenes = []
    for entries in scenes[1:]:
        rows = []
        for entry in entries:
            if isinstance(entry, str):
                rows.extend(entry.split("\n"))
            else:
                rows.append(entry)
        split_scenes.append(rows)

    print("dialogue: ", dialogues)
    return split_scenes, actionline, parenthetical_lis, list(set(speakers)), dialogues
|