# Standard library
import os
import re
import sys
from collections import Counter
from statistics import mode

# Third-party
import docx
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
from google.cloud import translate
from google.cloud import translate_v2 as Translate
from tqdm import tqdm
# import textract

# Project-local
from MNF.settings import BasePath
from narration.vectorcode.code.functions import ScriptBreakdown
from .translation.script_detector import script_cat
from .translation.script_writing import default_script

# NOTE: everything below runs at import time.

# Module-level Word document; its 'Normal' style is configured further down
# in this file. (The original constructed this object twice in a row; the
# first instance was immediately discarded, so one construction suffices.)
doc = docx.Document()
basePath = BasePath()

# google
# The service-account key path is hard-coded relative to the project base
# path and unconditionally overrides any value already in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/My First Project-2573112d5326.json"
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"

# Two Google Cloud Translation clients are created: the translate_v2 client
# and a TranslationServiceClient, together with the resource path built from
# the project id and location.
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

# Slugline regexes: optional digits, optional dot, optional whitespace, then
# one of I/E followed by one of N/X and T (e.g. "12. INT", "EXT").
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Optional digits and whitespace, then "I/E", "E/I", "I/I" or "E/E" with an
# optional trailing dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Transition cue strings and reserved words. Neither list is referenced by
# the live code in this section; presumably other modules import them --
# verify before changing their contents.
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:',
               'JUMP CUT TO:', 'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:',
               'I/C WITH:', 'BACK TO:', 'INTERVAL']
reserved_words = ['MONTAGE', 'PBS', 'FADE', 'FADE',
                  'TITLE', 'SPLIT', 'SCREEN', 'CUT']

# Configure the 'Normal' style of the module-level document.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)


def breaksen(s, max_words=256, chunk_words=32):
    """Split a long sentence into chunks of whitespace-separated words.

    Args:
        s: The text to split.
        max_words: Texts with at most this many words are returned whole.
        chunk_words: Number of words per chunk for longer texts.

    Returns:
        ``[s]`` (the untouched string) when ``s`` has at most ``max_words``
        words; otherwise a list of chunks, each the single-space join of up
        to ``chunk_words`` consecutive words.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk size avoids the trailing empty chunk that a
    # ``n // chunk + 1`` loop produces when the word count is an exact
    # multiple of the chunk size.
    return [" ".join(words[start:start + chunk_words])
            for start in range(0, len(words), chunk_words)]


# NOTE: commented-out legacy copies of getRefined, getSlugAndNonSlug,
# getSpeakers and getScenes used to live here. getInputs() below calls the
# same-named methods on ScriptBreakdown instead, so the dead copies were
# removed; consult version control if they are needed.


def language_detector(text):
    """Return the source language the Translation API detects for ``text``.

    Detection is obtained as a by-product of a translate request targeting
    Hindi ('hi'); only the ``detectedSourceLanguage`` field of the response
    is used. Every call is a network request.
    """
    result = translate_client.translate(text, target_language='hi')
    return result["detectedSourceLanguage"]


def script_det(text):
    """Return the script of the first non-punctuation character of ``text``.

    The character is passed to ``script_cat`` and the first element of its
    result is returned. If ``text`` is empty or contains only the listed
    punctuation characters, ``script_cat`` receives an empty string.
    Whitespace and digits are not treated as punctuation.
    """
    # Raw string: same characters as before, without the invalid '\,' escape.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]


def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the language distribution of dialogues.

    Args:
        dialogue_language: One detected language code per dialogue.
        non_dial_src_lang: Language code detected for the action lines.

    Returns:
        A tuple ``(A, B, C)``:
        A. language with the highest number of dialogues (ties resolved in
           favour of the language encountered first);
        B. number of dialogues in the action-line language, or 0 when that
           language is A itself or does not occur;
        C. number of dialogues in all remaining languages.

    Raises:
        IndexError: if ``dialogue_language`` is empty.
    """
    print("line 316:dialogue_language", dialogue_language)
    print(non_dial_src_lang)
    counts = Counter(dialogue_language)
    print("line 319:dict1", dict(counts))
    # most_common() sorts by descending count and keeps first-seen order for
    # equal counts, matching the previous hand-rolled ordering.
    ranked = counts.most_common()
    sorted_values = [count for _, count in ranked]
    print("line 321:sorted_values:", sorted_values)
    sources = [lang for lang, _ in ranked]
    print("line 328: sources: ", sources)

    A = sources[0]
    B = 0 if non_dial_src_lang == A else counts.get(non_dial_src_lang, 0)
    # Everything that is neither A nor the action-line language. Computing it
    # from the total (rather than slicing from rank 2) stays correct when A
    # equals the action-line language or when that language is not ranked
    # second.
    C = sum(sorted_values) - sorted_values[0] - B
    return A, B, C


def dial_each_word_lang1(non_dial_src_lang, dial):
    """Return "True" if any word of ``dial`` is detected as ``non_dial_src_lang``.

    Returns the strings "True"/"False" (not booleans). One detection request
    is made per word until a match is found.
    """
    found = any(language_detector(word) == non_dial_src_lang
                for word in dial.split())
    return "True" if found else "False"


def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Return "True" if any word of ``dial`` is in a third language.

    A word counts when its detected language is neither ``non_dial_src_lang``
    nor ``A``. Returns the strings "True"/"False" (not booleans).
    """
    known = (non_dial_src_lang, A)
    # A single detection per word; the previous ``!= a or != b`` test was
    # true for virtually every word and issued two requests per word.
    found = any(language_detector(word) not in known for word in dial.split())
    return "True" if found else "False"


def _dialogue_texts(scene):
    """Yield the dialogue text of each spoken entry in one scene.

    The first element of the scene is skipped, as are plain strings and
    entries keyed 'Transition'. Remaining entries are expected to be
    single-key dicts ``{speaker: [...]}`` whose value holds the dialogue
    text at index 2.
    """
    for index, entry in enumerate(scene):
        if index == 0 or isinstance(entry, str):
            continue
        [speaker] = entry.keys()
        if speaker == 'Transition':
            continue
        yield entry[speaker][2]


def word_with_actionline(scenes, A, non_dial_src_lang):
    """Check for action-line-language words inside dialogues spoken in ``A``.

    Returns "False" immediately when ``A`` equals ``non_dial_src_lang``.
    Otherwise returns "True" as soon as a dialogue detected as ``A`` contains
    a word detected as ``non_dial_src_lang``, and "False" if none does.
    """
    if A == non_dial_src_lang:
        return "False"
    for scene in tqdm(scenes[:]):
        for dial in _dialogue_texts(scene):
            if language_detector(dial) != A:
                continue
            if dial_each_word_lang1(non_dial_src_lang, dial) == "True":
                return "True"
    # Explicit result instead of implicitly returning None.
    return "False"


def word_with_other(scenes, A, non_dial_src_lang):
    """Check for third-language words inside dialogues spoken in ``A``.

    Returns "True" as soon as a dialogue detected as ``A`` contains a word
    whose language is neither ``A`` nor ``non_dial_src_lang``; otherwise
    "False".
    """
    for scene in tqdm(scenes[:]):
        for dial in _dialogue_texts(scene):
            if language_detector(dial) != A:
                continue
            if dial_each_word_lang2(non_dial_src_lang, A, dial) == "True":
                return "True"
    # Explicit result instead of implicitly returning None.
    return "False"


def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    The script is parsed with ``ScriptBreakdown`` and only the first 10
    scenes are sampled. The first plain-string line found (after each
    scene's first element) supplies the action-line language and script;
    every non-transition dialogue supplies one language and one script
    sample.

    Args:
        filename1: Path of the script file, forwarded to
            ``ScriptBreakdown().getRefined``.

    Returns:
        An 8-element list::

            [action-line language, dialogue language, dialogue script,
             action-line script, UI_option3, UI_option4, "Yes", "Yes"]

        ``UI_option3`` is "Yes" when some dialogues are in the action-line
        language (B > 0); ``UI_option4`` is "Yes" when some dialogues are in
        other languages (C > 0). The last two entries are fixed to "Yes".

    Raises:
        ValueError: if the sampled scenes contain no action line or no
            dialogue (previously an UnboundLocalError / IndexError).
    """
    refined, total_scenes = ScriptBreakdown().getRefined(filename1)
    _, without_slug = ScriptBreakdown().getSlugAndNonSlug(refined)
    characters = ScriptBreakdown().getSpeakers(without_slug)
    scenes, *_ = ScriptBreakdown().getScenes(
        refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    non_dial_src_lang = None
    non_dial_src_script = None
    action_line_seen = False

    # Only the first 10 scenes are sampled to limit detection requests.
    for scene in tqdm(scenes[:10]):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                # Only the very first action line is used for detection.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_det(line)
                    action_line_seen = True
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dial_text = line[speaker][2]
            dial_src_lang = language_detector(dial_text)
            print("dial_src_lang:line 430:", dial_src_lang)
            language_of_all_dialogues.append(dial_src_lang)
            script_of_all_dialogues.append(script_det(dial_text))

    print(language_of_all_dialogues)
    if not action_line_seen:
        raise ValueError(
            "no action line found in the sampled scenes; "
            "cannot detect the action-line language")
    if not language_of_all_dialogues:
        raise ValueError(
            "no dialogue found in the sampled scenes; "
            "cannot detect the dialogue language")

    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)

    # one_step_process is only reported on stdout, so an unknown language
    # must not abort the whole detection.
    try:
        expected_script = default_script[A]
    except KeyError:
        expected_script = None
    one_step_process = "Yes" if dial_src_script == expected_script else "Can_not_say"
    print("one_step_process", one_step_process)

    if B > 0:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"
    if C > 0:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"

    dial_src_lang = A
    # The per-word checks (word_with_actionline / word_with_other) that once
    # produced options 5 and 6 sat after this return and were unreachable;
    # both options are reported as "Yes", exactly as before.
    return [non_dial_src_lang, dial_src_lang, dial_src_script,
            non_dial_src_script, UI_option3, UI_option4, "Yes", "Yes"]

# filename1 = sys.argv[1]
# getInputs(filename1)