# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection_20_dec.py
#
# 340 lines
# 12 KiB
# Python
# Raw Normal View History
#
# 2024-04-27 09:33:09 +00:00
from google.cloud import translate_v2 as Translate
from google.cloud import translate
from .script_detector import script_cat
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
# Module-level python-docx document; in the visible code it is only used to
# configure the 'Normal' style further down.
doc = docx.Document()
# NOTE(review): the three docx imports below repeat imports made earlier in
# this file.
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
# Project base path; every file path built in this module is rooted here.
basePath = BasePath()
#google
# Point the Google client libraries at a credentials JSON file located under
# the project base path. This must run before the clients below are created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
# NOTE(review): these two imports also repeat imports made earlier in this file.
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector().
translate_client = Translate.Client()
# TranslationServiceClient plus its project/location settings; none of
# `client`, `project_id`, `location` or `parent` is referenced by the
# functions visible in this part of the file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
# Slug-line regex: optional digits, optional '.', optional whitespace, then one
# of I/E, one of N/X and a 'T'. Besides INT and EXT this also matches the
# combinations 'IXT' and 'ENT'.
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Slug-line regex for the abbreviated form: optional digits and whitespace,
# then I/E, I/I, E/I or E/E with an optional trailing '.'.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs equal to one of these strings are stored as {'Transition': ...}
# entries by getScenes().
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# Words that disqualify an upper-case line from being a speaker name in
# getSpeakers(). 'FADE' is listed twice; the duplicate has no effect because
# the list is only used in a set intersection.
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Configure the document's default ('Normal') style: 12 pt Courier New.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long piece of text into smaller word chunks.

    Text containing at most ``max_words`` whitespace-separated words is
    returned unchanged as a single-element list. Longer text is cut into
    consecutive pieces of ``chunk_words`` words each, with the words of a
    piece re-joined by single spaces.

    Args:
        s: The text to split.
        max_words: Largest word count that is still returned in one piece.
        chunk_words: Number of words per chunk when the text is split.

    Returns:
        A list of strings that together contain every word of ``s`` in order.
        No chunk is empty unless ``s`` itself has no words.

    Raises:
        ValueError: If ``chunk_words`` is smaller than 1.
    """
    if chunk_words < 1:
        raise ValueError("chunk_words must be at least 1")

    # Split once instead of re-splitting the whole text for every chunk.
    words = s.split()
    if len(words) <= max_words:
        # Short input is passed through untouched, original spacing included.
        return [s]

    # Stepping through the word list by chunk size never yields a trailing
    # empty chunk, which the previous ``n // 32 + 1`` loop produced whenever
    # the word count was an exact multiple of the chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
def getRefined(filename1):
    """Extract the text of a script file and return it as cleaned paragraphs.

    The input is run through ``textract.process`` and staged in two scratch
    files under ``{basePath}/conversion/translation`` (``file.txt`` and
    ``file1.txt``). The text is rewritten line by line into ``file1.txt``,
    extracted once more, split into paragraphs on blank lines, stripped, and
    emptied of blank paragraphs.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        A ``(refined, total_scenes)`` tuple. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first paragraph that
        looks like a slug line; if no paragraph does, it is the full list.
        ``total_scenes`` is the number of paragraphs that were examined and
        found not to be slug lines before that point.

    NOTE(review): the statement nesting below was reconstructed from a copy of
    this file that had lost its indentation. In particular, the final
    ``if not line=='\\n'`` write is placed inside ``if flag:`` here, but it
    could equally belong one level out -- confirm against the original file.
    """
    #print("get_refined_called")
    total_scenes = 0
    # NOTE(review): ``textract`` is used here, but the only ``import textract``
    # in the visible part of this file is commented out -- confirm it is
    # imported somewhere, otherwise this line raises NameError.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    # First scratch file: raw bytes returned by textract.
    filename= rf"{basePath}/conversion/translation/file.txt"
    f=open(filename, 'wb')
    f.write(text)
    f.close()
    # True while the line(s) following a character cue are still expected.
    dialog_coming=False
    f=open(filename, 'r', encoding="utf8", errors='ignore')
    doc11=f.read()
    f.close()
    # Second scratch file: receives the rewritten lines.
    f1=open(rf"{basePath}/conversion/translation/file1.txt",'w', encoding="utf8", errors='ignore')
    # ``c`` is never read again in this function.
    c=0
    # Becomes True once the first slug-like line has been written.
    flag=False
    for line in doc11.split("\n"):
        # Slug-like lines are copied through unstripped and switch ``flag`` on.
        if (line.strip().startswith(('INT.','INT ')) or \
            line.strip().startswith(('I/E','E/I')) or \
            line.strip().startswith(('EXT.','EXT ')) or \
            line.strip().startswith('EXT/INT') or \
            line.strip().startswith('INT/EXT') or \
            re.match(slug_pattern,line.strip())):
            flag=True
            f1.write(line)
            f1.write('\n')
            continue
        else:
            line = line.strip()
            if flag:
                # A stripped string can never equal '\n', so this test (and
                # the ``line=='\n'`` comparisons below) is never true.
                if line.strip()=='\n':
                    continue
                # Drop blank lines between a character cue and what follows it.
                if dialog_coming and (line=='\n' or line.strip()==""):
                    continue
                if dialog_coming:
                    f1.write(line)
                    f1.write('\n')
                    # A line starting with a "(...)" parenthetical keeps the
                    # dialogue state open; any other line closes it.
                    if re.match(r"\(.*\)",line):
                        continue
                    else:
                        dialog_coming=False
                        continue
                    # Unreachable: both branches above already ``continue``.
                    continue
                # An all-upper-case line that fully matches the cue pattern is
                # written out and opens the dialogue state.
                if line.isupper() and re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*",line.strip()):
                    f1.write(line)
                    f1.write('\n')
                    dialog_coming=True
                    continue
                # Everything else (including empty lines) is copied through.
                if not line=='\n':
                    f1.write(line)
                    f1.write('\n')
    f1.close()
    filename1=rf"{basePath}/conversion/translation/file1.txt"
    # file.txt is overwritten below with the extracted contents of file1.txt;
    # the original author notes it has no other use for now and that the two
    # scratch file names could be merged.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    filename=rf"{basePath}/conversion/translation/file.txt"
    # ``file_extension`` is never read again in this function.
    _, file_extension = os.path.splitext(filename1)
    f=open(filename, 'wb')
    f.write(text)
    f.close()
    # NOTE(review): opened without an explicit encoding (unlike the reads
    # above), and the name ``input`` shadows the builtin inside this function.
    with open(filename, "r") as input:
        # Paragraphs are separated by one blank line.
        input_ = input.read().split('\n\n')
    refined=[]
    for line in input_:
        refined.append(line.strip())
    refined=list(filter(lambda a: a != "", refined))
    #print("processing the script")
    # Skip (and count) every paragraph that precedes the first slug-like
    # paragraph, then keep the list from that paragraph onwards.
    for i in range(len(refined)):
        if not (refined[i].strip().startswith(('INT.','INT ')) or refined[i].strip().startswith(('EXT.','EXT ')) or refined[i].strip().startswith('EXT/INT') or refined[i].strip().startswith('INT/EXT') or refined[i].strip().startswith(('I/E','E/I')) or re.match(slug_pattern,refined[i].strip()) or re.match(pat,refined[i].strip())):
            total_scenes = total_scenes + 1
            continue
        refined=refined[i:]
        break
    # refined.append(line.strip())
    refined=list(filter(lambda a: a != "", refined))
    return refined,total_scenes
def getSlugAndNonSlug(refined):
    """Partition paragraphs into slug lines and all remaining paragraphs.

    A paragraph counts as a slug line when, after stripping, it starts with
    one of the INT/EXT style prefixes or matches ``slug_pattern`` or ``pat``
    at its beginning.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A ``(sluglines, without_slug)`` tuple of lists holding the stripped
        paragraphs, each list in input order.
    """
    slug_prefixes = (
        'INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT',
    )
    sluglines, without_slug = [], []
    for raw in refined:
        text = raw.strip()
        looks_like_slug = (
            text.startswith(slug_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        # Route the stripped paragraph to whichever list it belongs in.
        bucket = sluglines if looks_like_slug else without_slug
        bucket.append(text)
    return sluglines, without_slug
def getSpeakers(without_slug):
    """Collect the distinct speaker names found in non-slug paragraphs.

    Each paragraph is split into lines. A line is taken as a speaker cue when
    it is upper case, the probed follow-up line is not blank, and the line
    matches the cue pattern at its start. The name is the text before the
    first ``(``, stripped.

    The probed follow-up line is ``rows[cursor + 1]``, where ``cursor`` is
    advanced by one for every line and clamped to ``len(rows) - 2``.

    Args:
        without_slug: Iterable of paragraph strings (no slug lines).

    Returns:
        A list of unique, non-empty names that share no space-separated word
        with ``reserved_words``. The order comes from a ``set`` and is
        therefore not defined.
    """
    cue_pattern = r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*"
    found = []
    for para in without_slug:
        rows = para.split('\n')
        last_probe = len(rows) - 2
        cursor = 0
        for row in rows:
            cursor = min(cursor + 1, last_probe)
            if not row.isupper():
                continue
            if rows[cursor + 1].strip() == "":
                continue
            if re.match(cue_pattern, row):
                found.append(row.split("(")[0].strip())

    # De-duplicate, then drop empty names and names containing reserved words.
    return [
        name
        for name in list(set(found))
        if len(name) > 0
        and set(name.split(" ")).intersection(reserved_words) == set()
    ]
def getScenes(refined,total_scenes,characters):
    """Group the script's paragraphs into scenes and classify their contents.

    Every slug-like paragraph starts a new scene. Within a scene, a paragraph
    whose first line (text before any ``(``) is one of ``characters`` becomes
    a dialogue dict ``{speaker: [parenthetical, scene_no, dia, len(dia), prev,
    next]}``; a paragraph equal to one of ``transitions`` becomes
    ``{'Transition': text}``; any other paragraph is stored as a
    whitespace-normalised action-line string.

    Args:
        refined: List of paragraph strings.
        total_scenes: Not read anywhere in this function.
        characters: Collection of speaker names used for membership tests.

    Returns:
        A tuple ``(s, actionline, parenthetical_lis, speakers, dialogues)``:
        ``s`` is the list of scenes (each a list whose string entries have
        been split on newlines, followed by the dicts described above),
        ``actionline`` the list of action-line strings, ``parenthetical_lis``
        the parenthetical recorded for each dialogue (``'NONE'`` if absent),
        ``speakers`` the de-duplicated speaker names and ``dialogues`` the
        list of dialogue dicts.

    NOTE(review): the statement nesting below was reconstructed from a copy of
    this file that had lost its indentation; the bookkeeping after
    ``mydic[speaker]=...`` is assumed to sit inside the ``if not (...)`` guard
    -- confirm against the original file.
    """
    # To find scenes data structure and prev and next scenes numbers
    # Index of the next dialogue within the current scene.
    i=0
    scene=[]
    dialogues=[]
    speakers=[]
    # Filled below but not returned.
    slugline_dic={}
    # The following names are assigned but never read in this function:
    # prev_dial_speaker, next_dial_speaker, pc, successor_scene_no,
    # predecessor_scene_no, patttern.
    prev_dial_speaker=""
    next_dial_speaker=""
    pc=0
    scene_no=0
    actionline=[]
    successor_scene_no=0
    predecessor_scene_no=0
    parenthetical_lis=[]
    scenes=[]
    speaker=""
    parenthetical='NONE'
    patttern=r'[\d]*[.]?[\s]*[IE][NX]T'
    for line in refined:
        if line.strip().startswith(('INT.','INT')) or line.strip().startswith(('EXT.','EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E','E/I')) or re.match(slug_pattern,line.strip()) or re.match(pat,line.strip()):
            # Slug line: close the scene collected so far and start a new one.
            scenes.append(scene)
            scene=[]
            i=0
            scene_no+=1
            scene.append(line)
            # First line of the slug paragraph with leading/trailing digits,
            # dots, dashes and spaces removed.
            slugline_dic[scene_no]=line.split("\n")[0].strip('0123456789.- ')
        else:
            lis=line.split("\n")
            lis=[l.strip() for l in lis]
            word=lis[0]
            if word.split('(')[0].strip() in characters:
                # Dialogue paragraph: first line is the speaker cue.
                mydic={}
                prev_dial_speaker=speaker
                speaker=word.split('(')[0].strip()
                if len(lis)>1 and re.match(r"\(.*\)",lis[1]):
                    # Second line starts with "(...)": parenthetical, and the
                    # remaining lines form the dialogue, joined with spaces.
                    pc=pc+1
                    parenthetical=lis[1]
                    parenthetical=parenthetical.replace("\n","")
                    dia=' '.join(lis[2:])
                    dia=dia.replace("\n","")
                    ##renu
                    dia=dia.replace("\"", '')
                else:
                    # No parenthetical. NOTE(review): lines are joined with ''
                    # here but with ' ' in the branch above, so words at line
                    # ends run together -- confirm whether that is intended.
                    dia=''.join(lis[1:])
                    dia=dia.replace("\n","")
                    dia=dia.replace("\"", '')
                # Skip cues that carry neither dialogue nor a parenthetical.
                if not (len(dia)==0 and parenthetical=="NONE"):
                    # NOTE(review): ``main_lis`` is not defined anywhere in the
                    # visible part of this file. If it is undefined at runtime
                    # the bare ``except`` clauses swallow the NameError and
                    # ``prev``/``next`` are always "" -- confirm.
                    if i-1 >= 0:
                        try:
                            prev=main_lis[scene_no-1][i-1]
                        except:
                            prev=""
                    else:
                        prev=""
                    # ``next`` shadows the builtin inside this function.
                    try:
                        next=main_lis[scene_no-1][i+1]
                    except:
                        next=""
                    #prev is previous speaker and next is next speaker of the dialogue
                    mydic[speaker]=[parenthetical,scene_no,dia,len(dia),prev,next]
                    # print(mydic)
                    prev,next="",""
                    i=i+1
                    speakers.append(speaker)
                    parenthetical_lis.append(parenthetical)
                    dialogues.append(mydic)
                    scene.append(mydic)
                    # Reset so the next dialogue starts without a parenthetical.
                    parenthetical="NONE"
            else:
                # Action line or transition: collapse all whitespace runs.
                line=line.replace("\n"," ")
                line=' '.join(line.split())
                if line.strip() in transitions:
                    scene.append({'Transition':line.strip()})
                    continue
                actionline.append(line)
                scene.append(line.strip())
    # Flush the last scene.
    scenes.append(scene)
    speakers=list(set(speakers))
    # The first entry holds whatever was collected before the first slug line.
    scenes=scenes[1:]
    # Flatten string entries on newlines; dict entries are kept as they are.
    s = []
    for scene in scenes:
        s1=[]
        for ele in scene:
            if type(ele) == type(""):
                s1.extend(ele.split("\n"))
            else:
                s1.append(ele)
        s.append(s1)
    return s,actionline,parenthetical_lis,speakers,dialogues
def language_detector(text):
    """Return the source language detected for ``text``.

    The text is sent to the module-level v2 translate client with Hindi
    (``'hi'``) as the target language; only the ``detectedSourceLanguage``
    field of the response is used.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]
def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    The file is parsed with ``getRefined``, ``getSlugAndNonSlug``,
    ``getSpeakers`` and ``getScenes``. Each scene is then scanned, skipping
    its first element (the slug line). For action-line strings and dialogue
    texts, the language comes from ``language_detector`` and the script from
    ``script_cat`` applied to the second character of the text.

    Args:
        filename1: Path of the script file to analyse.

    Returns:
        ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script]``. An entry is ``''`` when the script contained
        no usable text of that kind (previously this case raised
        ``UnboundLocalError``).

    NOTE(review): the early ``break`` is kept inside the per-scene loop, so
    every scene is still visited and later scenes overwrite earlier results.
    The copy of the source this was reconstructed from had lost its
    indentation -- confirm the ``break`` was not meant to end the scan of all
    scenes.
    """
    refined, total_scenes = getRefined(filename1)
    _, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes = getScenes(refined, total_scenes, characters)[0]

    # Defaults for scripts that lack action lines and/or dialogue.
    non_dial_src_lang = ''
    non_dial_src_script = ''
    dial_src_lang = ''
    dial_src_script = ''

    for scene in tqdm(scenes):
        action_seen = False
        dialogue_seen = False
        for line in scene[1:]:
            if isinstance(line, str):
                # The script is read from the second character, so text
                # shorter than that cannot be classified.
                if len(line) < 2:
                    continue
                action_seen = True
                non_dial_src_lang = language_detector(line)
                non_dial_src_script = script_cat(line[1])[0]
            else:
                [speaker] = line.keys()
                # Transitions carry no dialogue and must not count as one.
                if speaker == 'Transition':
                    continue
                dialogue = line[speaker][2]
                if len(dialogue) < 2:
                    continue
                dialogue_seen = True
                dial_src_lang = language_detector(dialogue)
                dial_src_script = script_cat(dialogue[1])[0]
            # Both kinds of text found in this scene: stop scanning it.
            if action_seen and dialogue_seen:
                break

    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script]