454 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			454 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
| # import textract
 | ||
| from tqdm import tqdm
 | ||
| import sys 
 | ||
| import re
 | ||
| import docx
 | ||
| import os
 | ||
| doc = docx.Document() 
 | ||
| from docx.shared import Inches, Cm, Pt
 | ||
| from docx.enum.text import WD_ALIGN_PARAGRAPH
 | ||
| from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
 | ||
| from collections import Counter
 | ||
| 
 | ||
| 
 | ||
| from script_detector import script_cat
 | ||
| 
 | ||
# --- Google Cloud Translation setup ---------------------------------------
# The service-account key path is exported before either client is created.
# NOTE(review): ``basePath`` is not defined anywhere in this file; it has to
# exist before this line runs or the script stops with NameError -- confirm
# where it is supposed to come from.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"

from google.cloud import translate
from google.cloud import translate_v2 as Translate

# v2 client: the one language_detector() calls.
translate_client = Translate.Client()

# v3 client and its resource path; neither is referenced again in this file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
 | ||
| 
 | ||
| 
 | ||
# Scene-heading detectors. Both are applied with re.match, so they only have
# to match at the start of a stripped line or paragraph.
#   slug_pattern: optional digits, optional dot, optional whitespace, then
#                 one of I/E followed by one of N/X followed by T.
#   pat:          optional digits and whitespace, then I/E "/" I/E, optional dot.
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Paragraphs equal to one of these are stored as {'Transition': ...} entries.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]

# A candidate speaker name containing any of these words is discarded.
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]

# Default font of the module-level docx document: Courier New, 12 pt.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
 | ||
| 
 | ||
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long sentence into word-limited chunks.

    Args:
        s: Text to split.
        max_words: Inputs with at most this many whitespace-separated words
            are returned unchanged as a single-element list.
        chunk_words: Maximum number of words per chunk for longer inputs.

    Returns:
        ``[s]`` when the input is short enough; otherwise a list of chunks,
        each holding up to ``chunk_words`` words joined by single spaces.
        No chunk is ever empty.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk size avoids the empty trailing chunk that a
    # ``n // size + 1`` loop produces when n is an exact multiple of size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
 | ||
| 
 | ||
def getRefined(filename1):
    """Extract a screenplay's text and return it as cleaned paragraphs.

    The text of ``filename1`` is extracted with ``textract.process``, reduced
    to the lines from the first scene heading onwards (blank lines directly
    after a character cue are dropped so a cue and its dialogue form one
    paragraph), and split into paragraphs on blank lines.

    Side effects: writes the working files ``file.txt`` and ``file1.txt`` in
    the current directory.

    Args:
        filename1: Path of the screenplay document.

    Returns:
        A tuple ``(refined, total_scenes)``. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first scene heading
        (or every paragraph when no heading is found). ``total_scenes`` is,
        despite its name, the number of paragraphs skipped before that
        heading.
    """
    # The module-level ``import textract`` is commented out, so the name is
    # bound here; without it every call fails with NameError.
    import textract

    raw_path = "file.txt"
    compact_path = "file1.txt"

    # Pass 1: dump the extracted bytes and read them back as text.
    extracted = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as sink:
        sink.write(extracted)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as source:
        script_text = source.read()

    # Pass 2: keep only the script body, one kept line per output line.
    with open(compact_path, 'w', encoding="utf8", errors='ignore') as sink:
        for kept in _refine_compact_lines(script_text):
            sink.write(kept)
            sink.write('\n')

    # Pass 3: run the compacted text through textract again and split it
    # into paragraphs. The read is explicitly UTF-8 because the bytes were
    # produced as UTF-8; relying on the platform default corrupts non-ASCII
    # scripts on systems whose default encoding differs.
    extracted = textract.process(compact_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as sink:
        sink.write(extracted)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as source:
        paragraphs = source.read().split('\n\n')

    refined = [para.strip() for para in paragraphs if para.strip()]

    # Drop everything in front of the first scene heading, counting what is
    # skipped. With no heading at all the list is returned whole.
    total_scenes = 0
    for index, para in enumerate(refined):
        if _refine_is_heading(para, with_ie_pattern=True):
            refined = refined[index:]
            break
        total_scenes += 1

    return refined, total_scenes


def _refine_is_heading(text, with_ie_pattern=False):
    """Return True when the stripped text starts like a scene heading."""
    text = text.strip()
    prefixes = ('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT')
    if text.startswith(prefixes) or re.match(slug_pattern, text):
        return True
    return bool(with_ie_pattern and re.match(pat, text))


def _refine_compact_lines(raw_text):
    """Return the lines of ``raw_text`` that belong to the script body."""
    cue_pattern = r"([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*"
    kept = []
    in_script = False       # becomes True at the first scene heading
    dialog_coming = False   # True between a character cue and its first line
    for line in raw_text.split("\n"):
        if _refine_is_heading(line):
            in_script = True
            kept.append(line)  # headings are written unstripped
            continue
        if not in_script:
            continue
        line = line.strip()
        if dialog_coming:
            if line == "":
                continue  # swallow blank lines between cue and dialogue
            kept.append(line)
            # A parenthetical keeps us waiting for the spoken line itself.
            if not re.match(r"\(.*\)", line):
                dialog_coming = False
            continue
        if line.isupper() and re.fullmatch(cue_pattern, line):
            dialog_coming = True
        # Blank lines are kept here: they become the paragraph separators.
        kept.append(line)
    return kept
 | ||
| 
 | ||
def getSlugAndNonSlug(refined):
    """Partition paragraphs into scene headings and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs, each
        in input order. A paragraph is a slugline when it starts with one of
        the heading prefixes or matches ``slug_pattern`` or ``pat``.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    buckets = {True: [], False: []}
    for raw in refined:
        text = raw.strip()
        is_heading = bool(
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        buckets[is_heading].append(text)
    return buckets[True], buckets[False]
 | ||
| 
 | ||
def getSpeakers(without_slug):
    """Collect candidate speaker names from non-heading paragraphs.

    A line is a candidate when it is upper-case, the line right after it is
    not blank (the last line of a paragraph is compared with itself, so it
    always passes), and it starts like a character cue. Anything from the
    first ``(`` onwards is cut off the name.

    Args:
        without_slug: Iterable of paragraph strings.

    Returns:
        Unique names in first-seen order, excluding names that contain a word
        from ``reserved_words``.
    """
    cue_pattern = r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
    found = []
    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for position, item in enumerate(lines):
            if not item.isupper():
                continue
            # Look at the line immediately after the cue. The previous index
            # arithmetic was off by one and inspected the line after that.
            follower = lines[min(position + 1, last)]
            if follower.strip() == "":
                continue
            if re.match(cue_pattern, item):
                found.append(item.split("(")[0].strip())

    unique = [name for name in dict.fromkeys(found) if name]
    return [
        name for name in unique
        if set(name.split(" ")).isdisjoint(reserved_words)
    ]
 | ||
| 
 | ||
| 
 | ||
def getScenes(refined,total_scenes,characters):
    """Group refined paragraphs into scenes and collect dialogue data.

    Args:
        refined: Paragraph strings, normally the output of getRefined().
        total_scenes: Unused; kept so existing callers keep working.
        characters: Known speaker names, normally from getSpeakers().

    Returns:
        A tuple ``(scenes, actionline, parenthetical_lis, speakers,
        dialogues)``:

        * ``scenes``: one list per scene heading. Strings are heading or
          action lines; dicts are either ``{'Transition': text}`` or
          ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue),
          prev, next]}``. Content before the first heading is not included.
        * ``actionline``: every action paragraph, whitespace-normalised.
        * ``parenthetical_lis``: the parenthetical of each dialogue entry
          (``"NONE"`` when absent).
        * ``speakers``: unique speakers in first-seen order.
        * ``dialogues``: every dialogue dict, in order.
    """
    known_speakers = set(characters)
    scenes = []
    scene = []  # content seen before the first heading; dropped at the end
    scene_no = 0
    actionline = []
    dialogues = []
    speakers = []
    parenthetical_lis = []

    for paragraph in refined:
        if _scene_heading(paragraph):
            scenes.append(scene)
            scene_no += 1
            scene = [paragraph]
            continue

        lines = [part.strip() for part in paragraph.split("\n")]
        speaker = lines[0].split('(')[0].strip()

        if speaker not in known_speakers:
            text = ' '.join(paragraph.split())
            if text in transitions:
                scene.append({'Transition': text})
            else:
                actionline.append(text)
                scene.append(text)
            continue

        parenthetical = "NONE"
        spoken = lines[1:]
        if spoken and re.match(r"\(.*\)", spoken[0]):
            parenthetical = spoken[0]
            spoken = spoken[1:]
        # Wrapped dialogue lines are joined with a space in both cases; the
        # no-parenthetical case used '' and glued adjacent words together.
        dia = ' '.join(spoken).replace("\"", '')

        # A cue with neither dialogue nor parenthetical is dropped entirely.
        if not dia and parenthetical == "NONE":
            continue

        # prev/next speaker slots: the old lookup went through an undefined
        # name (``main_lis``) inside a bare ``except`` and therefore always
        # produced "". The slots are kept, explicitly empty, so the entry
        # layout does not change.
        entry = {speaker: [parenthetical, scene_no, dia, len(dia), "", ""]}
        speakers.append(speaker)
        parenthetical_lis.append(parenthetical)
        dialogues.append(entry)
        scene.append(entry)

    scenes.append(scene)

    # Flatten multi-line strings (headings) into individual lines.
    flattened = []
    for members in scenes[1:]:
        flat = []
        for member in members:
            if isinstance(member, str):
                flat.extend(member.split("\n"))
            else:
                flat.append(member)
        flattened.append(flat)

    return flattened, actionline, parenthetical_lis, list(dict.fromkeys(speakers)), dialogues


def _scene_heading(paragraph):
    """Return True when the paragraph starts like a scene heading."""
    text = paragraph.strip()
    # 'INT'/'EXT' also cover the longer INT./EXT./INT/EXT/EXT/INT prefixes.
    return bool(
        text.startswith(('INT', 'EXT', 'I/E', 'E/I'))
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )
 | ||
| 
 | ||
| 
 | ||
# The screenplay to analyse is the first command-line argument. Fail with a
# usage message instead of a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: {} <script-file>".format(sys.argv[0]))
filename1 = sys.argv[1]

# Parse pipeline: paragraphs -> headings / rest -> speaker names -> scenes.
refined, total_scenes = getRefined(filename1)
sluglines, without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)
 | ||
| #print(scenes)
 | ||
| 
 | ||
from functools import lru_cache


@lru_cache(maxsize=4096)
def language_detector(text):
    """Return the source language the translation service detects for text.

    The text is sent to ``translate_client.translate`` with Hindi as the
    target; only the ``detectedSourceLanguage`` field of the response is used.

    Results are memoised per input string because this script asks about the
    same dialogues and words several times, and every miss is a network
    round-trip. Calls that raise are not cached.

    Args:
        text: The string to classify.

    Returns:
        The value of ``detectedSourceLanguage`` in the response.
    """
    result = translate_client.translate(text, target_language='hi')
    return result["detectedSourceLanguage"]
 | ||
| 
 | ||
def script_det(text):
    """Return the script category of the first non-punctuation character.

    Args:
        text: The string to inspect.

    Returns:
        The first element of ``script_cat(char)``, where ``char`` is the
        first character of ``text`` not listed in ``punctuations``. When
        there is no such character ``script_cat`` is called with ``""``.
        NOTE(review): how ``script_cat`` treats an empty string is not
        visible from this file -- confirm.
    """
    # Raw string: the backslash is a literal member of the set. The previous
    # non-raw literal relied on the invalid escape sequence "\,".
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]
 | ||
| 
 | ||
'''
A. Language with the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages
'''
 | ||
| 
 | ||
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the languages detected across all dialogues.

    Args:
        dialogue_language: One detected language code per dialogue.
        non_dial_src_lang: Language code detected for the action lines.

    Returns:
        A tuple ``(A, B, C)``:

        * ``A``: the most frequent dialogue language (ties go to the one seen
          first), or ``None`` when there are no dialogues.
        * ``B``: number of dialogues in ``non_dial_src_lang``; 0 when that
          language has no dialogue, and 0 when only one dialogue language
          exists.
        * ``C``: number of dialogues outside the two most frequent languages.
          NOTE(review): this equals "dialogues in other languages" only when
          the action-line language is the second most frequent one -- confirm
          that is the intended definition.
    """
    # most_common() orders by count, ties in first-seen order.
    ranked = Counter(dialogue_language).most_common()
    if not ranked:
        return None, 0, 0

    A = ranked[0][0]
    if len(ranked) == 1:
        return A, 0, 0

    # .get() instead of an index lookup: the action-line language may not
    # occur in any dialogue at all.
    B = dict(ranked).get(non_dial_src_lang, 0)
    C = sum(count for _, count in ranked[2:])
    return A, B, C
 | ||
| 
 | ||
| 
 | ||
| #print(scenes)
 | ||
| 
 | ||
# Detect the language of the first action line and of every dialogue.
dialogue_language = []
# Pre-set so the A_B_C call below cannot hit an unbound name when the script
# contains no action line at all.
non_dial_src_lang = None
count = 0  # number of action lines already used for detection (0 or 1)
for scene in tqdm(scenes[:]):
    # scene[0] is the first line of the scene heading.
    for line in scene[1:]:
        if isinstance(line, str):
            if count == 0:
                non_dial_src_lang = language_detector(line)
                count += 1
            continue

        # Dict entries have exactly one key: a speaker name or 'Transition'.
        [speaker] = line.keys()
        if speaker == 'Transition':
            continue

        dial_src_lang = language_detector(line[speaker][2])
        dialogue_language.append(dial_src_lang)

A, B, C = A_B_C(dialogue_language, non_dial_src_lang)
 | ||
| # print("A = {} B = {} C = {}".format(A, B, C))
 | ||
| 
 | ||
| 
 | ||
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Report whether any word of a dialogue is in the action-line language.

    Words are checked in order and checking stops at the first hit, which is
    also printed.

    Args:
        non_dial_src_lang: Language code of the action lines.
        dial: Dialogue text.

    Returns:
        The string ``"True"`` or ``"False"`` (not a bool).
    """
    hit = next(
        (token for token in dial.split()
         if language_detector(token) == non_dial_src_lang),
        None,
    )
    if hit is None:
        return "False"
    print("word", hit)
    return "True"
 | ||
| 
 | ||
def dial_each_word_lang2(non_dial_src_lang, A, dial ):
    """Report whether any word of a dialogue is in a third language.

    A word counts when its detected language is neither the action-line
    language nor the main dialogue language. Checking stops at the first hit,
    which is also printed.

    Args:
        non_dial_src_lang: Language code of the action lines.
        A: Main dialogue language code.
        dial: Dialogue text.

    Returns:
        The string ``"True"`` or ``"False"`` (not a bool).
    """
    expected = (non_dial_src_lang, A)
    for word in dial.split():
        # "Neither of the two": the previous ``!= x or != A`` test held for
        # every word whenever the two languages differed. The language is
        # also detected once per word instead of twice.
        if language_detector(word) not in expected:
            print("in 4")
            print("word", word)
            return "True"
    return "False"
 | ||
| 
 | ||
def word_with_actionline(scenes):
    """Find a main-language dialogue containing an action-line-language word.

    Only dialogues whose detected language equals the module-level ``A`` are
    inspected, word by word, against the module-level ``non_dial_src_lang``.

    Args:
        scenes: Scene lists as returned by getScenes().

    Returns:
        The string ``"True"`` as soon as such a dialogue is found, otherwise
        ``None``.
    """
    for scene in tqdm(scenes[:]):
        # scene[0] is the heading; plain strings are action lines.
        for line in scene[1:]:
            if isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue

            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            # The word check is evaluated only for dialogues it was computed
            # for; previously the result variable was read even when it had
            # never been assigned, raising UnboundLocalError.
            if dial_each_word_lang1(non_dial_src_lang, dialogue) == "True":
                return "True"
    return None
 | ||
| 
 | ||
| 
 | ||
def word_with_other(scenes):
    """Find a main-language dialogue containing a word in another language.

    Only dialogues whose detected language equals the module-level ``A`` are
    inspected; the per-word decision is made by dial_each_word_lang2().

    Args:
        scenes: Scene lists as returned by getScenes().

    Returns:
        The string ``"True"`` as soon as such a dialogue is found, otherwise
        ``None``.
    """
    for scene in tqdm(scenes[:]):
        # scene[0] is the heading; plain strings are action lines.
        for line in scene[1:]:
            if isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue

            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            # The word check is evaluated only for dialogues it was computed
            # for; previously the result variable was read even when it had
            # never been assigned, raising UnboundLocalError.
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue) == "True":
                return "True"
    return None
 | ||
| 
 | ||
# Word-level checks, run in this order before anything is reported.
word_lang_with_actionline = word_with_actionline(scenes)
word_lang_with_other = word_with_other(scenes)

# Report. The labels are emitted exactly as before (spelling included) in
# case another program reads this output.
print("actionline_lanuge", non_dial_src_lang)
print("dial_language", A)
print("UI option3 - yes" if B > 0 else "UI option3 - no")
print("UI option4 - yes" if C > 0 else "UI option4 - no")
print("UI option5 - Yes" if word_lang_with_actionline == "True" else "UI_option5 - NO")
print("UI option6 - Yes" if word_lang_with_other == "True" else "UI option6 - No")
 | ||
| 
 | ||
|       
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
|     
 |