"""Parse a screenplay into scenes and detect its source languages/scripts.

The module extracts text from a script file, normalises it into paragraphs,
splits it into scenes (slug line, action lines, dialogues, transitions) and
uses the Google Translate client to detect the language of the action lines
and of the dialogues.
"""
import os
import re
import sys

import docx
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
from google.cloud import translate
from google.cloud import translate_v2 as Translate
from tqdm import tqdm

from MNF.settings import BasePath
from .script_detector import script_cat

# NOTE: ``textract`` is imported lazily inside getRefined(); the module-level
# import was commented out in the original file, which left getRefined()
# failing with a NameError.

doc = docx.Document()
basePath = BasePath()

# google
# The credentials path must be exported before the clients are created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

# Optional scene number, then INT/EXT (the pattern is deliberately loose).
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Optional scene number, then I/E or E/I.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']

style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

# Literal prefixes that mark a slug line.
_SLUG_PREFIXES = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I')

# A character cue on its own line: upper-case words separated by spaces and/or
# hyphens, optional "#1"-style numbering, optional "(V.O.)"-style suffix.
# This accepts the same strings as the original
#   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but without a quantified group wrapping a bare ``+`` run, which made the
# original backtrack exponentially on upper-case lines that do not match
# (e.g. a shouted action line ending in punctuation).
_CHARACTER_CUE = re.compile(
    r"(?:[A-Z'’]+(?:(?:\s+-*|-+)[A-Z'’]+)*(?:\s+-*|-+)?)?"
    r"(?:#*\s*[1-9])*"
    r"(?:\(.*\))?"
)


def _is_slugline(text, include_ie_pattern=True):
    """Return True when *text* (after stripping) starts like a slug line.

    A slug line starts with one of ``_SLUG_PREFIXES`` or matches
    ``slug_pattern``; when *include_ie_pattern* is true a match of ``pat``
    also counts. ``slug_pattern`` already matches anything starting with
    ``INT``/``EXT``, so the ``'INT '``/``'EXT '`` prefixes used by one of the
    original copies of this test select exactly the same lines.
    """
    text = text.strip()
    if text.startswith(_SLUG_PREFIXES) or re.match(slug_pattern, text):
        return True
    return bool(include_ie_pattern and re.match(pat, text))


def _neighbour_speaker(scene_index, position):
    """Look up ``main_lis[scene_index][position]``, or ``""`` if unavailable.

    NOTE(review): ``main_lis`` is not defined in the visible part of this
    module, so this lookup currently always yields ``""``; confirm whether a
    global of that name is expected to exist. The original used a bare
    ``except:`` here, which also swallowed KeyboardInterrupt/SystemExit.
    """
    try:
        return main_lis[scene_index][position]  # noqa: F821
    except (NameError, LookupError, TypeError):
        return ""


def breaksen(s, max_words=256, chunk_words=32):
    """Split a long sentence into chunks of at most *chunk_words* words.

    Args:
        s: Text to split.
        max_words: Texts with at most this many words are returned unchanged
            as a single-element list.
        chunk_words: Number of words per chunk for longer texts.

    Returns:
        A list of strings. Unlike the original implementation, no trailing
        empty chunk is produced when the word count is an exact multiple of
        *chunk_words*.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]


def getRefined(filename1):
    """Extract the script text and return it as a list of paragraphs.

    The text is extracted with ``textract``, rewritten so that a character
    cue and the dialogue following it form one paragraph (blank lines between
    them are dropped), and then split on blank lines. Everything before the
    first slug line is discarded.

    Two scratch files, ``file.txt`` and ``file1.txt`` under
    ``{basePath}/conversion/translation/``, are overwritten on every call.

    Args:
        filename1: Path of the script file to read.

    Returns:
        ``(refined, total_scenes)`` where ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first slug line and
        ``total_scenes`` is the number of paragraphs that preceded that slug
        line in the second pass (the name is historical).
    """
    # Imported lazily so that importing this module does not require textract.
    import textract

    raw_path = rf"{basePath}/conversion/translation/file.txt"
    compact_path = rf"{basePath}/conversion/translation/file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        raw_text = raw_file.read()

    # Pass 1: copy the script line by line, gluing each character cue to its
    # parenthetical/dialogue by skipping the blank lines in between.
    seen_slug = False
    dialog_coming = False
    with open(compact_path, 'w', encoding="utf8", errors='ignore') as compact:
        for line in raw_text.split("\n"):
            stripped = line.strip()
            if _is_slugline(stripped, include_ie_pattern=False):
                seen_slug = True
                compact.write(line)
                compact.write('\n')
                continue
            if not seen_slug:
                # Nothing before the first slug line is kept.
                continue
            if dialog_coming:
                if stripped == "":
                    continue
                compact.write(stripped)
                compact.write('\n')
                # A parenthetical keeps us waiting for the dialogue itself.
                if not re.match(r"\(.*\)", stripped):
                    dialog_coming = False
                continue
            if stripped.isupper() and _CHARACTER_CUE.fullmatch(stripped):
                compact.write(stripped)
                compact.write('\n')
                dialog_coming = True
                continue
            compact.write(stripped)
            compact.write('\n')

    # Pass 2: re-extract the compacted text and split it into paragraphs.
    text = textract.process(compact_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    # Read back with the encoding the bytes were produced in; the original
    # relied on the platform default encoding here.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]

    # print("processing the script")
    total_scenes = 0
    for index, paragraph in enumerate(refined):
        if _is_slugline(paragraph):
            refined = refined[index:]
            break
        total_scenes += 1
    return refined, total_scenes


def getSlugAndNonSlug(refined):
    """Partition paragraphs into slug lines and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``, both lists of stripped paragraphs in
        their original order.
    """
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if _is_slugline(para):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug


def getSpeakers(without_slug):
    """Collect the distinct character names found in non-slug paragraphs.

    A line counts as a character cue when it is upper-case, the probed
    follow-up line of the same paragraph is not blank, and it starts like a
    name. Names containing a word from ``reserved_words`` are dropped.

    Args:
        without_slug: Iterable of paragraph strings.

    Returns:
        A list of unique, non-empty character names (order not defined).
    """
    characters = []
    for para in without_slug:
        lines = para.split('\n')
        probe = 0
        for item in lines:
            # Index logic kept exactly as in the original: the probe advances
            # by one per line but is capped at len(lines) - 2.
            probe = min(probe + 1, len(lines) - 2)
            if not item.isupper() or lines[probe + 1].strip() == "":
                continue
            if re.match(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*", item):
                characters.append(item.split("(")[0].strip())
    characters = list(set(characters))
    return [
        character
        for character in characters
        if len(character) > 0
        and set(character.split(" ")).isdisjoint(reserved_words)
    ]


def getScenes(refined,total_scenes,characters):
    """Group paragraphs into scenes and classify their content.

    Each scene starts at a slug line. A paragraph whose first line is a known
    character becomes a dialogue dict
    ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue), prev, next]}``;
    a paragraph equal to one of ``transitions`` becomes
    ``{'Transition': text}``; anything else is an action line.

    Args:
        refined: Paragraphs as returned by getRefined().
        total_scenes: Unused; kept for interface compatibility.
        characters: Known character names as returned by getSpeakers().

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``.
        ``scenes`` is a list of scenes, each a list whose string elements are
        single lines and whose other elements are the dicts described above.
        Content before the first slug line is not part of any scene.
    """
    dialogue_index = 0
    scene = []
    scenes = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0

    for line in refined:
        if _is_slugline(line):
            scenes.append(scene)
            scene = [line]
            dialogue_index = 0
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        candidate = lis[0].split('(')[0].strip()

        if candidate not in characters:
            flat = ' '.join(line.replace("\n", " ").split())
            if flat.strip() in transitions:
                scene.append({'Transition': flat.strip()})
                continue
            actionline.append(flat)
            scene.append(flat.strip())
            continue

        speaker = candidate
        parenthetical = "NONE"
        if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
            parenthetical = lis[1].replace("\n", "")
            spoken = lis[2:]
        else:
            spoken = lis[1:]
        # Wrapped dialogue lines are joined with a space in both branches;
        # the original joined with '' when there was no parenthetical, which
        # glued the last word of one line to the first word of the next.
        dia = ' '.join(spoken).replace("\n", "").replace("\"", '')

        if len(dia) == 0 and parenthetical == "NONE":
            # A bare character name with nothing said: not a dialogue.
            continue

        if dialogue_index - 1 >= 0:
            prev_speaker = _neighbour_speaker(scene_no - 1, dialogue_index - 1)
        else:
            prev_speaker = ""
        next_speaker = _neighbour_speaker(scene_no - 1, dialogue_index + 1)

        # prev is previous speaker and next is next speaker of the dialogue
        entry = {
            speaker: [parenthetical, scene_no, dia, len(dia), prev_speaker, next_speaker]
        }
        dialogue_index += 1
        speakers.append(speaker)
        parenthetical_lis.append(parenthetical)
        dialogues.append(entry)
        scene.append(entry)

    scenes.append(scene)
    speakers = list(set(speakers))

    # The first accumulated "scene" holds whatever preceded the first slug.
    scenes = scenes[1:]
    flattened = []
    for scene in scenes:
        lines = []
        for element in scene:
            if isinstance(element, str):
                lines.extend(element.split("\n"))
            else:
                lines.append(element)
        flattened.append(lines)
    return flattened, actionline, parenthetical_lis, speakers, dialogues


def language_detector(text):
    """Return the source language the translate client detects for *text*.

    The text is sent to ``translate_client.translate`` with Hindi as the
    target and the ``detectedSourceLanguage`` field of the result is returned.
    """
    result = translate_client.translate(text, target_language='hi')
    det_lang = result["detectedSourceLanguage"]
    return det_lang


def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    Every scene is scanned (skipping its first element, the slug line) until
    both an action line and a dialogue/transition element have been seen;
    values found in later scenes overwrite earlier ones.

    Args:
        filename1: Path of the script file.

    Returns:
        ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script]``. An entry is ``''`` when the script contains
        nothing to detect it from (the original raised UnboundLocalError in
        that case).
    """
    non_dial_src_lang = ''
    non_dial_src_script = ''
    dial_src_lang = ''
    dial_src_script = ''

    refined, total_scenes = getRefined(filename1)
    sluglines, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(
        refined, total_scenes, characters)

    for scene in tqdm(scenes):
        seen_action = False
        seen_dialogue = False
        for position, line in enumerate(scene):
            if position == 0:
                continue
            if isinstance(line, str):
                seen_action = True
                # script_cat is fed the second character, so shorter text
                # cannot be classified (the original raised IndexError).
                if len(line) > 1:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_cat(line[1])[0]
            else:
                seen_dialogue = True
                [speaker] = line.keys()
                if speaker == 'Transition':
                    continue
                dialogue = line[speaker][2]
                if len(dialogue) > 1:
                    dial_src_lang = language_detector(dialogue)
                    dial_src_script = script_cat(dialogue[1])[0]
            if seen_action and seen_dialogue:
                break

    mydata = [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script]
    return mydata