"""Parse a screenplay and report which languages its action lines and dialogue use.

Usage: ``python <this_script> <screenplay_file>``

The screenplay is converted to text with ``textract``, split into scenes
(scene heading, action lines, dialogue, transitions), and every action line /
dialogue is sent to Google Translate to detect its language.  The findings are
printed to stdout as ``UI optionN - yes/no`` lines.
"""
import os
import re
import sys
from collections import Counter
from functools import lru_cache

import docx
# The original file had this import commented out although getRefined() calls
# textract.process(); without it the script dies with a NameError.
import textract
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
from google.cloud import translate
from google.cloud import translate_v2 as Translate
from script_detector import script_cat
from tqdm import tqdm

doc = docx.Document()

# google
# NOTE(review): `basePath` is not defined anywhere in this file -- confirm it is
# injected by whatever executes this script, otherwise this line raises NameError.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"

translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

# Optional scene number, then INT/EXT (the character classes also admit IXT/ENT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Optional scene number, then I/E or E/I.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:', 'JUMP TO:',
               'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE', 'PBS', 'FADE', 'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']

style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

# Literal prefixes that open a scene heading.
_SLUG_PREFIXES = ('INT.', 'INT ', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I')

# A line consisting only of a character cue, e.g. "JOHN", "MAN #2", "MARY (V.O.)".
# The original pattern was  ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*  ; its
# first group nests `+` inside `*`, which backtracks exponentially on long
# upper-case lines that end up not matching.  (L+ S* D*)* and (L S* D*)* accept
# exactly the same strings, so the single-letter form below is equivalent but
# unambiguous.
_CHARACTER_CUE_RE = re.compile(r"(?:[A-Z'’][\s]*[-]*)*(?:[#]*[\s]*[1-9])*(?:\(.*\))*")

_PARENTHETICAL_RE = re.compile(r"\(.*\)")


def _is_slugline(text):
    """Return True when *text* (already stripped) starts like a scene heading.

    The original code repeated this test in four places with slightly
    different prefix lists; the first pass of getRefined() additionally left
    out `pat`, so a numbered "12 I/E ..." heading was not recognised there.
    All call sites now share this one predicate.
    """
    return bool(
        text.startswith(_SLUG_PREFIXES)
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )


def breaksen(s, max_words=256, chunk_words=32):
    """Split *s* into word chunks when it is longer than *max_words* words.

    Returns ``[s]`` unchanged when it has at most *max_words* words, otherwise
    a list of strings of at most *chunk_words* words each.  The original
    iterated ``range(n // 32 + 1)`` and therefore appended an empty trailing
    chunk whenever the word count was an exact multiple of 32.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    return [" ".join(words[start:start + chunk_words])
            for start in range(0, len(words), chunk_words)]


def getRefined(filename1):
    """Convert a screenplay file into a list of paragraphs starting at the first scene.

    Side effects: writes the intermediate files ``file.txt`` and ``file1.txt``
    in the current working directory.

    Returns:
        (refined, total_scenes): ``refined`` is the list of non-empty,
        stripped paragraphs from the first scene heading onwards (or all
        paragraphs when no heading is found); ``total_scenes`` is the number
        of paragraphs that preceded that first heading.
    """
    raw_dump = "file.txt"
    cleaned_dump = "file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_dump, 'wb') as fh:
        fh.write(text)
    with open(raw_dump, 'r', encoding="utf8", errors='ignore') as fh:
        content = fh.read()

    # First pass: drop everything before the first scene heading and remove the
    # blank lines between a character cue and its dialogue, so that cue,
    # parenthetical and dialogue end up in one blank-line-delimited paragraph.
    seen_slug = False
    dialog_coming = False
    with open(cleaned_dump, 'w', encoding="utf8", errors='ignore') as out:
        for raw_line in content.split("\n"):
            stripped = raw_line.strip()
            if _is_slugline(stripped):
                seen_slug = True
                # Scene headings are written exactly as read (not stripped).
                out.write(raw_line)
                out.write('\n')
                continue
            if not seen_slug:
                continue
            if dialog_coming:
                if stripped == "":
                    continue
                out.write(stripped)
                out.write('\n')
                # A parenthetical keeps us waiting for the dialogue itself.
                if not _PARENTHETICAL_RE.match(stripped):
                    dialog_coming = False
                continue
            out.write(stripped)
            out.write('\n')
            if stripped.isupper() and _CHARACTER_CUE_RE.fullmatch(stripped):
                dialog_coming = True

    # file.txt is overwritten with the cleaned text (round-tripped via textract).
    text = textract.process(cleaned_dump, encoding="utf8", errors='ignore')
    with open(raw_dump, 'wb') as fh:
        fh.write(text)
    # Explicit encoding: the bytes written above are UTF-8, the platform
    # default encoding may not be.
    with open(raw_dump, "r", encoding="utf8", errors='ignore') as fh:
        paragraphs = [block.strip() for block in fh.read().split('\n\n')]
    refined = [block for block in paragraphs if block != ""]

    first_slug = next(
        (idx for idx, block in enumerate(refined) if _is_slugline(block)), None
    )
    if first_slug is None:
        total_scenes = len(refined)
    else:
        total_scenes = first_slug
        refined = refined[first_slug:]
    return refined, total_scenes


def getSlugAndNonSlug(refined):
    """Partition paragraphs into scene headings and everything else.

    Returns:
        (sluglines, without_slug): two lists of stripped paragraphs, each in
        input order.
    """
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if _is_slugline(para):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug


def getSpeakers(without_slug):
    """Collect the character names that appear as dialogue cues.

    A line counts as a cue when it is upper-case, a later line of the same
    paragraph is non-blank, and it matches the cue pattern.  Names containing
    a word from ``reserved_words`` are discarded.

    Returns:
        list of unique character names (order is unspecified -- it comes from
        a set).
    """
    cue_re = re.compile(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*")
    found = set()
    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for idx, item in enumerate(lines):
            # Closed form of the original running counter
            # (i += 1; i = min(i, len - 2); look at lis[i + 1]).
            # NOTE(review): this inspects the line *two* below the cue, clamped
            # to the last line (so the last line inspects itself) -- behaviour
            # kept as-is; confirm whether idx + 1 was intended.
            probe = lines[min(idx + 2, last)]
            if not item.isupper() or probe.strip() == "":
                continue
            if cue_re.match(item):
                found.add(item.split("(")[0].strip())
    reserved = set(reserved_words)
    return [name for name in found
            if len(name) > 0 and not reserved.intersection(name.split(" "))]


def getScenes(refined, total_scenes, characters):
    """Build the per-scene structure from the refined paragraphs.

    Each scene is a list whose first element is the scene-heading text,
    followed by action lines (str), ``{'Transition': text}`` dicts and
    dialogue dicts of the form
    ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue), prev, next]}``.

    ``total_scenes`` is accepted for interface compatibility and is not used.

    Returns:
        (scenes, actionline, parenthetical_lis, speakers, dialogues)
    """
    known_characters = set(characters)
    scenes = []
    scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0

    for block in refined:
        if _is_slugline(block.strip()):
            scenes.append(scene)
            scene_no += 1
            scene = [block]
            continue

        lines = [part.strip() for part in block.split("\n")]
        speaker = lines[0].split('(')[0].strip()
        if speaker in known_characters:
            if len(lines) > 1 and _PARENTHETICAL_RE.match(lines[1]):
                parenthetical = lines[1].replace("\n", "")
                spoken = lines[2:]
            else:
                parenthetical = "NONE"
                spoken = lines[1:]
            # The original joined with '' in the no-parenthetical branch, which
            # glued the last word of one line to the first word of the next.
            dia = ' '.join(spoken).replace("\n", "").replace("\"", '')
            # A bare cue with neither dialogue nor parenthetical is dropped.
            if len(dia) == 0 and parenthetical == "NONE":
                continue
            # The original looked prev/next up in an undefined `main_lis` inside
            # bare `except:` clauses, so both were always "".  That observable
            # result is kept, without the swallowed NameError.
            record = {speaker: [parenthetical, scene_no, dia, len(dia), "", ""]}
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(record)
            scene.append(record)
        else:
            flat = ' '.join(block.replace("\n", " ").split())
            if flat.strip() in transitions:
                scene.append({'Transition': flat.strip()})
                continue
            actionline.append(flat)
            scene.append(flat.strip())

    scenes.append(scene)
    speakers = list(set(speakers))
    # The first entry holds whatever preceded the first scene heading.
    scenes = scenes[1:]

    # Split multi-line strings (scene-heading paragraphs) into separate lines.
    flattened = []
    for one_scene in scenes:
        items = []
        for element in one_scene:
            if isinstance(element, str):
                items.extend(element.split("\n"))
            else:
                items.append(element)
        flattened.append(items)
    return flattened, actionline, parenthetical_lis, speakers, dialogues


@lru_cache(maxsize=4096)
def language_detector(text):
    """Return the source language Google Translate detects for *text*.

    Performs a translation to Hindi and reads ``detectedSourceLanguage`` from
    the response.  Results are cached because the same dialogue is examined
    several times during one run and every call is a network request.
    """
    result = translate_client.translate(text, target_language='hi')
    return result["detectedSourceLanguage"]


def script_det(text):
    """Return ``script_cat(ch)[0]`` for the first character of *text* not in the punctuation list.

    When every character is punctuation (or *text* is empty) ``script_cat`` is
    called with an empty string, as before.
    """
    # Raw string: the original non-raw literal contained the invalid escape "\,".
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]


def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the detected dialogue languages.

    A. Language of the highest number of full dialogues,
    B. Number of dialogues in the action-line language,
    C. Number of dialogues in other languages.

    Returns ``(None, 0, 0)`` when there are no dialogues (the original raised
    IndexError) and ``B == 0`` when the action-line language never occurs in
    dialogue (the original raised ValueError).  When only one dialogue
    language exists, B and C are both 0, as before.
    """
    counts = Counter(dialogue_language)
    if not counts:
        return None, 0, 0
    # most_common() orders by count descending and keeps first-seen order for ties.
    ranked = counts.most_common()
    A = ranked[0][0]
    if len(ranked) == 1:
        return A, 0, 0
    B = counts.get(non_dial_src_lang, 0)
    # NOTE(review): this skips the two most frequent languages, which equals
    # "languages other than A and the action-line language" only when the
    # action-line language is the second most frequent -- kept as in the original.
    C = sum(n for _, n in ranked[2:])
    return A, B, C


def dial_each_word_lang1(non_dial_src_lang, dial):
    """Return "True" if any word of *dial* is detected as the action-line language, else "False"."""
    for word in dial.split():
        if language_detector(word) == non_dial_src_lang:
            print("word", word)
            return "True"
    return "False"


def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Return "True" if any word of *dial* is in a language other than both *non_dial_src_lang* and *A*.

    The original joined the two inequality tests with ``or``, which is true
    for every word unless both languages coincide; ``and`` expresses
    "neither of the two".
    """
    for word in dial.split():
        word_lang = language_detector(word)
        if word_lang != non_dial_src_lang and word_lang != A:
            print("in 4")
            print("word", word)
            return "True"
    return "False"


def _iter_dialogue_texts(scenes):
    """Yield the dialogue text of every dialogue entry, skipping headings, action lines and transitions."""
    for scene in tqdm(scenes):
        for idx, entry in enumerate(scene):
            if idx == 0 or isinstance(entry, str):
                continue
            [speaker] = entry.keys()
            if speaker == 'Transition':
                continue
            yield entry[speaker][2]


def word_with_actionline(scenes):
    """Return "True" if a dialogue in language A contains a word in the action-line language, else None.

    Reads the module-level ``A`` and ``non_dial_src_lang``.
    """
    for dialogue_text in _iter_dialogue_texts(scenes):
        if language_detector(dialogue_text) == A:
            if dial_each_word_lang1(non_dial_src_lang, dialogue_text) == "True":
                return "True"
    return None


def word_with_other(scenes):
    """Return "True" if a dialogue in language A contains a word in a third language, else None.

    Reads the module-level ``A`` and ``non_dial_src_lang``.
    """
    for dialogue_text in _iter_dialogue_texts(scenes):
        if language_detector(dialogue_text) == A:
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue_text) == "True":
                return "True"
    return None


# ---------------------------------------------------------------------------
# Script body
# ---------------------------------------------------------------------------
filename1 = sys.argv[1]
refined, total_scenes = getRefined(filename1)
sluglines, without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)

dialogue_language = []
# Stays None when the script has no action line at all; the original left the
# name unbound in that case and crashed at the A_B_C() call below.
non_dial_src_lang = None
count = 0
for scene in tqdm(scenes):
    for i, line in enumerate(scene):
        if i == 0:
            continue  # element 0 is the scene heading
        if isinstance(line, str):
            # Only the first action line is used to determine the action-line language.
            if count == 0:
                non_dial_src_lang = language_detector(line)
                count += 1
        else:
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dial_src_lang = language_detector(line[speaker][2])
            dialogue_language.append(dial_src_lang)

A, B, C = A_B_C(dialogue_language, non_dial_src_lang)

word_lang_with_actionline = word_with_actionline(scenes)
word_lang_with_other = word_with_other(scenes)

# NOTE(review): the strings below are reproduced byte-for-byte (including the
# "lanuge" spelling and the "UI_option5 - NO" variant) because a caller may be
# parsing this output -- confirm before normalising them.
print("actionline_lanuge", non_dial_src_lang)
print("dial_language", A)
if B > 0:
    print("UI option3 - yes")
else:
    print("UI option3 - no")
if C > 0:
    print("UI option4 - yes")
else:
    print("UI option4 - no")
if word_lang_with_actionline == "True":
    print("UI option5 - Yes")
else:
    print("UI_option5 - NO")
if word_lang_with_other == "True":
    print("UI option6 - Yes")
else:
    print("UI option6 - No")