# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection_20_dec.py
#
# NOTE: the repository viewer this file was copied from reported that it
# contains ambiguous Unicode characters, i.e. characters that might be
# confused with other characters. If that is not intentional, reveal and
# review them before relying on string comparisons in this module.

from google.cloud import translate_v2 as Translate
from google.cloud import translate
from .script_detector import script_cat
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
# Module-level python-docx document. In this section only its 'Normal' style
# is touched (see the font settings at the end of this setup block).
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
# Project base path; used below and in getRefined() to build file paths.
basePath = BasePath()
# Google Cloud setup: point the client libraries at the service-account key
# file stored under the project's conversion/ directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector().
translate_client = Translate.Client()
# TranslationServiceClient plus its project/location path. None of the
# functions visible in this section reference `client` or `parent`.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
# Slugline prefix: optional digits, optional dot, optional whitespace, then
# one of I/E, one of N/X, and T (so INT and EXT, but also ENT and IXT).
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Combined interior/exterior prefix: optional digits, optional whitespace,
# then <I or E>/<I or E> with an optional trailing dot (e.g. "I/E", "E/I.").
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs equal to one of these strings are stored as {'Transition': ...}
# entries by getScenes().
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# getSpeakers() discards any candidate name containing one of these words.
# 'FADE' is listed twice; the duplicate has no effect on that check.
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Set the default ('Normal') style of `doc` to Courier New, 12 pt.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s, max_words=256, chunk_words=32):
    """Split an over-long string into chunks of whitespace-separated words.

    Args:
        s: Text to split.
        max_words: Strings with at most this many words are returned whole
            and unmodified, as a one-element list.
        chunk_words: Number of words per chunk once ``s`` exceeds
            ``max_words``.

    Returns:
        A list of strings. For long input each chunk is its words re-joined
        with single spaces, so the original spacing is not preserved.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Step through the words directly instead of iterating n // chunk + 1
    # times: the latter produced a trailing empty chunk whenever the word
    # count was an exact multiple of the chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
def getRefined(filename1):
# Extracts the text of `filename1` with textract, rewrites it line by line
# into file1.txt, re-reads that file and returns
# (paragraphs, total_scenes): the blank-line-separated paragraphs, and a
# counter that is incremented once per paragraph examined before the first
# slugline is found.
# NOTE(review): `textract` is called here, but the module-level
# `import textract` is commented out in this file, so these calls raise
# NameError unless the name is supplied some other way.
# NOTE(review): this copy of the file has lost its indentation. The nesting
# of the statements in the line loop (in particular whether the final write
# sits inside `if flag:` or at loop level) must be confirmed against the
# original before the structure is restored.
#print("get_refined_called")
total_scenes = 0
# First pass: dump the extracted bytes to file.txt, then read them back as
# UTF-8 text (undecodable bytes are ignored).
text = textract.process(filename1, encoding="utf8", errors='ignore')
filename= rf"{basePath}/conversion/translation/file.txt"
f=open(filename, 'wb')
f.write(text)
f.close()
# dialog_coming is True while the lines following a character cue are copied.
dialog_coming=False
f=open(filename, 'r', encoding="utf8", errors='ignore')
doc11=f.read()
f.close()
f1=open(rf"{basePath}/conversion/translation/file1.txt",'w', encoding="utf8", errors='ignore')
# `c` is never read in this function.
c=0
# flag becomes True once the first slugline has been seen.
flag=False
for line in doc11.split("\n"):
# Slugline: the stripped line starts with INT./INT , EXT./EXT , I/E, E/I,
# EXT/INT or INT/EXT, or matches slug_pattern. It is written unstripped.
if (line.strip().startswith(('INT.','INT ')) or \
line.strip().startswith(('I/E','E/I')) or \
line.strip().startswith(('EXT.','EXT ')) or \
line.strip().startswith('EXT/INT') or \
line.strip().startswith('INT/EXT') or \
re.match(slug_pattern,line.strip())):
flag=True
f1.write(line)
f1.write('\n')
continue
else:
line = line.strip()
if flag:
# `line` is already stripped, so it can never equal '\n' here.
if line.strip()=='\n':
continue
# Blank lines that follow a character cue are dropped, which keeps the cue
# and what follows in one blank-line-delimited paragraph for the
# split('\n\n') further down.
if dialog_coming and (line=='\n' or line.strip()==""):
continue
if dialog_coming:
f1.write(line)
f1.write('\n')
# A line starting with a "(...)" parenthetical keeps dialogue mode on; any
# other line is taken as the dialogue and ends it.
if re.match(r"\(.*\)",line):
continue
else:
dialog_coming=False
continue
# Unreachable: both branches above already `continue`.
continue
# Character cue: an all-uppercase line that fully matches the name pattern
# (optionally followed by a number and/or a parenthetical).
if line.isupper() and re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*",line.strip()):
f1.write(line)
f1.write('\n')
dialog_coming=True
continue
# `line` is stripped, so this condition is always true when it is reached.
if not line=='\n':
f1.write(line)
f1.write('\n')
f1.close()
filename1=rf"{basePath}/conversion/translation/file1.txt"
# Second pass: file1.txt is run through textract again and the result
# overwrites file.txt, which is then read back below.
text = textract.process(filename1, encoding="utf8", errors='ignore')
filename=rf"{basePath}/conversion/translation/file.txt"
# `file_extension` is never read in this function.
_, file_extension = os.path.splitext(filename1)
f=open(filename, 'wb')
f.write(text)
f.close()
# NOTE(review): this open() passes no encoding, unlike the earlier reads, so
# the platform default is used; the name `input` also shadows the builtin.
with open(filename, "r") as input:
input_ = input.read().split('\n\n')
# Strip every paragraph and drop the empty ones.
refined=[]
for line in input_:
refined.append(line.strip())
refined=list(filter(lambda a: a != "", refined))
#print("processing the script")
# Skip leading paragraphs until the first slugline (same prefixes as above,
# plus the `pat` regex) and keep everything from that paragraph on. If no
# slugline exists, `refined` is left unchanged.
for i in range(len(refined)):
if not (refined[i].strip().startswith(('INT.','INT ')) or refined[i].strip().startswith(('EXT.','EXT ')) or refined[i].strip().startswith('EXT/INT') or refined[i].strip().startswith('INT/EXT') or refined[i].strip().startswith(('I/E','E/I')) or re.match(slug_pattern,refined[i].strip()) or re.match(pat,refined[i].strip())):
total_scenes = total_scenes + 1
continue
refined=refined[i:]
break
# refined.append(line.strip())
refined=list(filter(lambda a: a != "", refined))
return refined,total_scenes
def getSlugAndNonSlug(refined, slug_regex=None, ie_regex=None):
    """Partition paragraphs into sluglines and everything else.

    A paragraph counts as a slugline when, after stripping, it starts with
    ``INT``, ``EXT``, ``I/E`` or ``E/I`` (which also covers ``INT.``,
    ``EXT.``, ``INT/EXT`` and ``EXT/INT``), or when it matches either regex
    at its start.

    Args:
        refined: Iterable of paragraph strings.
        slug_regex: Pattern for numbered/prefixed sluglines. Defaults to the
            module-level ``slug_pattern``.
        ie_regex: Pattern for combined interior/exterior prefixes. Defaults
            to the module-level ``pat``.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs, each
        in input order.
    """
    if slug_regex is None:
        slug_regex = slug_pattern
    if ie_regex is None:
        ie_regex = pat
    # The longer literals the original listed ('INT.', 'EXT/INT', ...) all
    # begin with one of these, so a single tuple is equivalent.
    prefixes = ('INT', 'EXT', 'I/E', 'E/I')

    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if (para.startswith(prefixes)
                or re.match(slug_regex, para)
                or re.match(ie_regex, para)):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug
def getSpeakers(without_slug, reserved=None):
    """Collect character names from non-slugline paragraphs.

    A line is taken as a character cue when it is all uppercase, the line
    after it is not blank (the last line of a paragraph is compared with
    itself, so a lone uppercase line qualifies), and it starts with the
    name pattern below. The name is the text before the first ``(``.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by
            ``"\\n"``).
        reserved: Words that disqualify a name when any space-separated part
            of the name equals one of them. Defaults to the module-level
            ``reserved_words``.

    Returns:
        Unique, non-empty names in order of first appearance.
    """
    if reserved is None:
        reserved = reserved_words
    reserved = set(reserved)
    name_regex = re.compile(
        r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*")

    # A dict keeps first-seen order; the original list(set(...)) returned
    # the names in arbitrary order.
    found = {}
    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for idx, item in enumerate(lines):
            if not item.isupper():
                continue
            # Look at the line directly after the cue. The original advanced
            # its counter before indexing and so inspected the line two
            # positions ahead.
            if lines[min(idx + 1, last)].strip() == "":
                continue
            if not name_regex.match(item):
                continue
            name = item.split("(")[0].strip()
            if name and reserved.isdisjoint(name.split(" ")):
                found.setdefault(name, None)
    return list(found)
def getScenes(refined, total_scenes, characters):
    """Group paragraphs into scenes and pull out dialogue, action and speakers.

    A new scene starts at every slugline paragraph (prefix ``INT``, ``EXT``,
    ``I/E``, ``E/I``, or a match of the module-level ``slug_pattern`` /
    ``pat``). Within a scene each later paragraph becomes one of:

    * ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue), prev,
      next]}`` when its first line (text before ``(``) is in ``characters``;
      ``parenthetical`` is ``'NONE'`` when the second line is not a
      ``(...)`` line. A cue with neither dialogue nor parenthetical is
      dropped.
    * ``{'Transition': text}`` when the whitespace-normalised paragraph is in
      the module-level ``transitions``.
    * the whitespace-normalised paragraph itself (an action line).

    Args:
        refined: Paragraph strings in script order.
        total_scenes: Accepted for interface compatibility; not used.
        characters: Collection of known character names.

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``.
        ``scenes`` is a list of scenes, each a list whose string elements
        have been split on newlines; anything before the first slugline is
        not part of any scene, but its action lines and dialogue entries are
        still included in the flat lists. ``speakers`` is unique, in order
        of first appearance.
    """
    slug_prefixes = ('INT', 'EXT', 'I/E', 'E/I')

    def is_slug(text):
        # Same test as the original chained startswith()/re.match() calls;
        # the longer literal prefixes all begin with one of slug_prefixes.
        text = text.strip()
        return bool(text.startswith(slug_prefixes)
                    or re.match(slug_pattern, text)
                    or re.match(pat, text))

    def neighbour(scene_index, pos):
        # NOTE(review): `main_lis` is not defined in the visible part of this
        # module; when it is missing (or the index is out of range) the
        # lookup fails and "" is used, as in the original bare `except`.
        try:
            return main_lis[scene_index][pos]
        except Exception:
            return ""

    scenes = []
    scene = []
    actionline = []
    parenthetical_lis = []
    speakers = []
    dialogues = []
    position = 0  # number of dialogue entries recorded in the current scene
    scene_no = 0

    for para in refined:
        if is_slug(para):
            scenes.append(scene)
            scene = [para]
            position = 0
            scene_no += 1
            continue

        lines = [part.strip() for part in para.split("\n")]
        speaker = lines[0].split('(')[0].strip()

        if speaker not in characters:
            text = ' '.join(para.split())
            if text in transitions:
                scene.append({'Transition': text})
            else:
                actionline.append(text)
                scene.append(text)
            continue

        parenthetical = 'NONE'
        if len(lines) > 1 and re.match(r"\(.*\)", lines[1]):
            parenthetical = lines[1]
            spoken = lines[2:]
        else:
            spoken = lines[1:]
        # Join with a space in both cases. The original used '' when there
        # was no parenthetical, which fused the last word of one line with
        # the first word of the next.
        dia = ' '.join(spoken).replace("\"", '')

        if len(dia) == 0 and parenthetical == 'NONE':
            # Bare cue with nothing attached: record nothing.
            continue

        prev_item = neighbour(scene_no - 1, position - 1) if position >= 1 else ""
        next_item = neighbour(scene_no - 1, position + 1)
        entry = {speaker: [parenthetical, scene_no, dia, len(dia),
                           prev_item, next_item]}
        position += 1
        speakers.append(speaker)
        parenthetical_lis.append(parenthetical)
        # The same dict object goes into both lists, as before.
        dialogues.append(entry)
        scene.append(entry)

    scenes.append(scene)

    # scenes[0] holds whatever preceded the first slugline; it is discarded.
    flattened = []
    for current in scenes[1:]:
        items = []
        for element in current:
            if isinstance(element, str):
                items.extend(element.split("\n"))
            else:
                items.append(element)
        flattened.append(items)

    unique_speakers = list(dict.fromkeys(speakers))
    return flattened, actionline, parenthetical_lis, unique_speakers, dialogues
def language_detector(text):
    """Return the source-language code Google Translate reports for *text*.

    The text is sent through the module-level v2 client's ``translate`` call
    with ``'hi'`` as the target language. Only the
    ``detectedSourceLanguage`` field of the response is used; the translated
    text itself is discarded.
    """
    result = translate_client.translate(text, target_language='hi')
    return result["detectedSourceLanguage"]
def getInputs(filename1):
# Runs the parsing pipeline on `filename1` (getRefined -> getSlugAndNonSlug
# -> getSpeakers -> getScenes), then walks the scenes calling
# language_detector() and script_cat() on action lines (str elements) and on
# dialogue text (element [2] of the dict entries). Returns
# [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script].
# NOTE(review): the four result variables are only assigned inside the loop
# below (their initialisation sits inside the string literal that follows),
# so the final list raises UnboundLocalError if no action line or no
# dialogue entry is ever processed.
# NOTE(review): this copy of the file has lost its indentation; whether the
# `break` near the end leaves the per-scene loop or the loop over all scenes
# must be confirmed against the original before the structure is restored.
'''
non_dial_src_lang =''
dial_src_lang =''
dial_src_script =''
'''
refined, total_scenes = getRefined(filename1)
sluglines, without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(
refined, total_scenes, characters)
# The commented-out block below is an earlier version of the loop that
# follows it.
# print("scene")
# x = "False"
# y = "False"
# for i, line in enumerate(scene):
# if i == 0:
# continue
# if type(line) == type(""):
# x = "True"
# non_dial_src_lang = language_detector(line)
# else:
# y = "True"
# [speaker] = line.keys()
# if speaker == 'Transition':
# continue
# if line[speaker][0] != 'NONE':
# continue
# dial_src_lang = language_detector(line[speaker][2])
# dial_src_script = script_cat(line[speaker][2][1])[0]
for scene in tqdm(scenes):
#print("scene")
# x / y record (as the strings "True"/"False") whether an action line /
# a dict entry has been seen.
x = "False"
y = "False"
for i,line in enumerate(scene):
# Element 0 of a scene is skipped (getScenes puts the slugline first).
if i == 0:
continue
if type(line)==type(""):
x = "True"
non_dial_src_lang = language_detector(line)
# NOTE(review): line[1] is the second character of the action line; a
# one-character line raises IndexError here.
non_dial_src_script = script_cat(line[1])[0]
else:
# y is set before the Transition check, so a transition entry also counts
# as "dialogue seen" even though no dialogue language is detected for it.
y = "True"
[speaker] = line.keys()
if speaker == 'Transition':
continue
# if line[speaker][0] != 'NONE':
# continue
dial_src_lang = language_detector(line[speaker][2])
# NOTE(review): [2][1] is the second character of the dialogue text;
# dialogue shorter than two characters raises IndexError here.
dial_src_script = script_cat(line[speaker][2][1])[0]
if x == "True" and y == "True":
break
mydata = [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script]
return mydata