# import textract
import os
import re
import sys
from collections import Counter
from functools import lru_cache
from statistics import mode

import docx
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
from google.cloud import translate
from google.cloud import translate_v2 as Translate
from tqdm import tqdm

from MNF.settings import BasePath

from .translation.script_detector import script_cat
from .translation.script_writing import default_script

doc = docx.Document()

basePath = BasePath()

# google
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
# The credentials path must be exported before the clients below are built.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"

translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']

style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

# Line prefixes that match the slugline regex ("INT...") but are not scene
# headings.
_NON_SLUG_PREFIXES = ('INTERCUT', 'INTERMISSION', 'INTERVAL')


def _is_slugline(text):
    """Return True when *text* starts like a scene heading (slugline).

    Only the start of the stripped text is inspected, so a multi-line
    paragraph is classified by its first line.
    """
    text = text.strip()
    if text.startswith(_NON_SLUG_PREFIXES):
        return False
    # The literal prefixes that used to be tested one by one ('INT.', 'INT ',
    # 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I') are all matched by
    # these two patterns, so the regexes alone are sufficient.
    return bool(re.match(slug_pattern, text) or re.match(pat, text))


def breaksen(s, max_words=256, chunk_words=32):
    """Split a long string into chunks of whitespace-separated words.

    Args:
        s: Text to split.
        max_words: Texts with at most this many words are returned whole.
        chunk_words: Number of words per chunk for longer texts.

    Returns:
        ``[s]`` (unchanged) when ``s`` has at most ``max_words`` words,
        otherwise a list of chunks, each the single-space join of up to
        ``chunk_words`` consecutive words. No empty chunk is produced.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk size avoids the empty trailing chunk that appeared
    # when the word count was an exact multiple of the chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]


def getRefined(filename1):
    """Extract a script's text and regroup it into paragraphs.

    The text extracted from ``filename1`` is written to
    ``<basePath>/conversion/translation/file.txt``. A cleaned copy that starts
    at the first slugline is written to ``file1.txt`` in the same directory;
    in it, blank lines between a speaker cue and its dialogue are dropped so
    that cue, parentheticals and the first dialogue line form one paragraph.
    ``file1.txt`` is then re-extracted into ``file.txt`` and split on blank
    lines.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        A tuple ``(refined, total_scenes)``. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first slugline
        paragraph (or every paragraph if none is a slugline).
        ``total_scenes`` is the number of paragraphs that preceded that first
        slugline paragraph.
    """
    # Imported here because the module-level import is commented out; without
    # it the first textract call fails with NameError.
    import textract

    print("get_refined_called")
    total_scenes = 0
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        raw_text = raw_file.read()

    dialog_coming = False   # True after a speaker cue, until its dialogue line
    seen_slugline = False   # nothing is written before the first slugline
    print("Slugline")
    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as cleaned:
        for line in raw_text.split("\n"):
            line = line.strip()
            print("line 427:", line)
            if _is_slugline(line):
                seen_slugline = True
                cleaned.write(line + '\n')
                continue
            print("line 96: else loop", line)
            if not seen_slugline:
                continue
            print("line 99: if loop:", line)
            if dialog_coming:
                if line == "":
                    # Skip blank lines so the cue stays attached to its dialogue.
                    print("line empty or just have newline", line)
                    continue
                print("line 101 probable dialog or PC: ", line)
                cleaned.write(line + '\n')
                if not re.match(r"\(.*\)", line):
                    # First non-parenthetical line after the cue ends the
                    # dialogue state.
                    print(" line 207: else of PCs", line)
                    dialog_coming = False
                    print(" line 457 dialog over")
                continue
            if line.isupper() and (
                re.fullmatch(r"([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", line)
                or re.fullmatch(r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+", line)
            ):
                print("line 111: May be speaker: ", line)
                cleaned.write(line + '\n')
                dialog_coming = True
                continue
            # Anything else (including blank lines, which become the paragraph
            # separators) is copied through.
            print("470 probably action or something else so just write it", line)
            cleaned.write(line + '\n')
    print("line 132 file closed")

    # file.txt is overwritten with the extracted content of file1.txt.
    text = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    print("line 136: ", text)
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    # Explicit encoding: the bytes were produced as utf8 above, so relying on
    # the platform default could mis-decode or raise.
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]
    for index, para in enumerate(refined):
        if _is_slugline(para):
            refined = refined[index:]
            break
        total_scenes += 1
    print("line 163:Refined", refined)
    return refined, total_scenes


def getSlugAndNonSlug(refined):
    """Partition paragraphs into slugline paragraphs and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A tuple ``(sluglines, without_slug)`` of stripped paragraphs, each
        list keeping the input order.
    """
    sluglines = []
    without_slug = []
    for para in refined:
        para = para.strip()
        if _is_slugline(para):
            sluglines.append(para)
        else:
            without_slug.append(para)
    return sluglines, without_slug


def getSpeakers(without_slug):
    """Collect candidate character names from non-slugline paragraphs.

    A line is a candidate when it is upper-case, a following line (see the
    look-ahead note in the body) is not blank, and it matches the cue
    pattern. The name is the text before the first ``(``. Names containing
    any of ``reserved_words`` as a space-separated word are discarded.

    Args:
        without_slug: Iterable of paragraph strings.

    Returns:
        Sorted list of unique, non-empty character names.
    """
    reserved = set(reserved_words)
    names = set()
    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for index, item in enumerate(lines):
            # Same line the original index arithmetic inspected: two lines
            # ahead, clamped to the last line of the paragraph.
            lookahead = lines[min(index + 2, last)]
            if not item.isupper() or lookahead.strip() == "":
                continue
            if re.match(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*", item):
                name = item.split("(")[0].strip()
                if name:
                    names.add(name)
    return sorted(name for name in names if reserved.isdisjoint(name.split(" ")))


def getScenes(refined, total_scenes, characters):
    """Build the per-scene structure from the refined paragraphs.

    Each scene is a list whose first element is the slugline, followed by
    action-line strings, ``{'Transition': text}`` dicts and dialogue dicts of
    the form ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue),
    "", ""]}``. The parenthetical is ``"NONE"`` when absent. The last two
    slots (previous/next speaker) are always empty strings.

    Args:
        refined: Paragraphs as returned by ``getRefined``.
        total_scenes: Unused; kept for interface compatibility.
        characters: Known speaker names, as returned by ``getSpeakers``.

    Returns:
        A tuple ``(scenes, actionline, parenthetical_lis, speakers,
        dialogues)``. Paragraphs before the first slugline are not part of
        any returned scene.
    """
    known_speakers = set(characters)
    scenes = []
    scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0

    for para in refined:
        if _is_slugline(para):
            scenes.append(scene)
            scene = [para]
            scene_no += 1
            continue

        lines = [part.strip() for part in para.split("\n")]
        print(" \n Line 222 probable dialogue list", lines)
        speaker = lines[0].split('(')[0].strip()

        if speaker not in known_speakers:
            flat = ' '.join(para.split())
            if flat in transitions:
                scene.append({'Transition': flat})
                continue
            actionline.append(flat)
            scene.append(flat)
            continue

        print("Speaker 228", speaker)
        if len(lines) > 1 and re.match(r"\(.*\)", lines[1]):
            parenthetical = lines[1]
            spoken_lines = lines[2:]
        else:
            parenthetical = "NONE"
            spoken_lines = lines[1:]
        # Join with a space in both cases; joining with '' fused the last word
        # of one line with the first word of the next.
        dia = ' '.join(spoken_lines).replace("\"", '')
        print(" length dia\n", len(dia))
        if len(dia) == 0 and parenthetical == "NONE":
            # A bare cue with nothing under it is dropped.
            continue
        print(" len dia != and Parenthetical == NONE: 384 ")
        # The previous/next-speaker lookup read a name that is never defined
        # and swallowed the resulting error, so both slots were always "".
        entry = {speaker: [parenthetical, scene_no, dia, len(dia), "", ""]}
        print("line 259", entry)
        speakers.append(speaker)
        parenthetical_lis.append(parenthetical)
        dialogues.append(entry)
        scene.append(entry)

    scenes.append(scene)

    # Drop the pre-slugline bucket and split multi-line string elements (the
    # slugline paragraphs) into one string per line.
    result = []
    for scene in scenes[1:]:
        rows = []
        for element in scene:
            if isinstance(element, str):
                rows.extend(element.split("\n"))
            else:
                rows.append(element)
        result.append(rows)
    print("dialogue: ", dialogues)
    return result, actionline, parenthetical_lis, list(set(speakers)), dialogues


@lru_cache(maxsize=4096)
def language_detector(text):
    """Return the source language the translate client detects for *text*.

    Results are cached per text (bounded) because the same dialogue strings
    are detected again by several passes in this module.
    """
    result = translate_client.translate(text, target_language='hi')
    return result["detectedSourceLanguage"]


def script_det(text):
    """Return the first element of ``script_cat`` for the first character of
    *text* that is not in the punctuation set (``""`` if there is none)."""
    # Raw string: same characters as before, without the invalid "\," escape.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]


def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the languages detected for the dialogues.

    Args:
        dialogue_language: Detected language of each dialogue.
        non_dial_src_lang: Language of the action lines.

    Returns:
        A tuple ``(A, B, C)``:
        A. language with the highest number of dialogues (first seen wins a
           tie),
        B. number of dialogues in the action-line language,
        C. number of dialogues in languages other than A and the action-line
           language.
        When only one language occurs, B and C are both 0.

    Raises:
        IndexError: if ``dialogue_language`` is empty.
    """
    counts = Counter(dialogue_language)
    A = counts.most_common(1)[0][0]
    if len(counts) == 1:
        return A, 0, 0
    B = counts.get(non_dial_src_lang, 0)
    # Count by language rather than by rank position: slicing the sorted
    # counts was only right when the action-line language ranked second.
    C = sum(
        number for lang, number in counts.items()
        if lang not in (A, non_dial_src_lang)
    )
    return A, B, C


def dial_each_word_lang1(non_dial_src_lang, dial):
    """Return "True" if any word of *dial* is detected as the action-line
    language, else "False"."""
    for word in dial.split():
        if language_detector(word) == non_dial_src_lang:
            return "True"
    return "False"


def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Return "True" if any word of *dial* is detected as a language that is
    neither the action-line language nor *A*, else "False"."""
    expected = (non_dial_src_lang, A)
    for word in dial.split():
        # One detection per word. The previous "!= x or != A" test was true
        # for every word whenever the two languages differed.
        if language_detector(word) not in expected:
            return "True"
    return "False"


def _iter_dialogue_texts(scenes):
    """Yield the dialogue text of every dialogue entry in *scenes*, skipping
    each scene's first element, plain strings and transitions."""
    for scene in tqdm(scenes):
        for index, element in enumerate(scene):
            if index == 0 or isinstance(element, str):
                continue
            [speaker] = element.keys()
            if speaker == 'Transition':
                continue
            yield element[speaker][2]


def word_with_actionline(scenes, A, non_dial_src_lang):
    """Return "True" if a dialogue detected as language *A* contains a word
    detected as the action-line language, else "False".

    Always "False" when *A* equals the action-line language.
    """
    if A == non_dial_src_lang:
        return "False"
    for dial in _iter_dialogue_texts(scenes):
        if language_detector(dial) != A:
            continue
        if dial_each_word_lang1(non_dial_src_lang, dial) == "True":
            return "True"
    return "False"


def word_with_other(scenes, A, non_dial_src_lang):
    """Return "True" if a dialogue detected as language *A* contains a word
    detected as neither *A* nor the action-line language, else "False"."""
    for dial in _iter_dialogue_texts(scenes):
        if language_detector(dial) != A:
            continue
        if dial_each_word_lang2(non_dial_src_lang, A, dial) == "True":
            return "True"
    return "False"


def getInputs(filename1):
    """Analyse a script file and derive its language/script settings.

    Args:
        filename1: Path of the script file.

    Returns:
        ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]``
        where the languages/scripts come from the first action line and the
        dialogues, and each UI option is "Yes" or "No".

    Raises:
        ValueError: if the script yields no action line or no dialogue.
    """
    refined, total_scenes = getRefined(filename1)
    sluglines, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    action_line_seen = False
    non_dial_src_lang = None
    non_dial_src_script = None
    for scene in tqdm(scenes):
        for index, element in enumerate(scene):
            if index == 0:
                continue
            if isinstance(element, str):
                # Only the first action line decides the action-line language.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(element)
                    non_dial_src_script = script_det(element)
                    action_line_seen = True
                continue
            [speaker] = element.keys()
            if speaker == 'Transition':
                continue
            dialogue_text = element[speaker][2]
            language_of_all_dialogues.append(language_detector(dialogue_text))
            script_of_all_dialogues.append(script_det(dialogue_text))

    # Fail with a clear message instead of UnboundLocalError / IndexError.
    if not action_line_seen:
        raise ValueError("no action line found in script; cannot detect action-line language")
    if not language_of_all_dialogues:
        raise ValueError("no dialogue found in script; cannot detect dialogue language")

    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)
    word_lang_with_actionline = word_with_actionline(scenes, A, non_dial_src_lang)
    word_lang_with_other = word_with_other(scenes, A, non_dial_src_lang)

    print("actionline_lanuge", non_dial_src_lang)
    print("A = {} B = {} C = {}".format(A, B, C))
    print("dial_language", A)
    dial_src_lang = A
    print("dial_src_script", dial_src_script)

    if B > 0:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"

    if C > 0:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"

    if word_lang_with_actionline == "True":
        print("UI option5 - Yes")
        UI_option5 = "Yes"
    else:
        print("UI_option5 - NO")
        UI_option5 = "No"

    if word_lang_with_other == "True":
        print("UI option6 - Yes")
        UI_option6 = "Yes"
    else:
        print("UI option6 - No")
        UI_option6 = "No"

    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script,
            UI_option3, UI_option4, UI_option5, UI_option6]