Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection123.py

454 lines
14 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Module setup: imports, Google Translate clients, screenplay-parsing constants
# and the default font of a module-level Word document.
# NOTE(review): everything below runs at import time, including the creation of
# the two Google API clients.

# NOTE(review): `textract` is called by getRefined() further down, but this is
# the only import of it in the visible file and it is commented out — confirm.
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
# Module-level Word document; in the visible code only its 'Normal' style is
# configured (see the font settings at the end of this section).
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from collections import Counter
from script_detector import script_cat
# Google Cloud Translation credentials and clients.
#google
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="gifted-mountain-318504-0a5f94cda0c8.json"
# NOTE(review): `basePath` is neither defined nor imported anywhere in the
# visible source; as written this line would raise NameError — confirm where it
# is supposed to come from.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector().
translate_client = Translate.Client()
# `client`, `project_id`, `location` and `parent` are not referenced again in
# the visible code.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
# Scene-heading regex: optional digits, optional '.', optional whitespace, then
# one of I/E, one of N/X, and T. Besides INT and EXT this also matches the
# prefixes ENT and IXT, and it is applied with re.match (prefix match only).
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Scene-heading regex for the slash forms: optional digits, optional whitespace,
# then I/I, I/E, E/I or E/E with an optional trailing '.'.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Action paragraphs exactly equal to one of these are stored by getScenes() as
# {'Transition': ...} entries.
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# getSpeakers() discards any candidate speaker name containing one of these
# words. 'FADE' is listed twice.
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Default body font of `doc`: Courier New, 12 pt.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long string into pieces of at most ``chunk_words`` words.

    Args:
        s: Text to split. Words are whatever ``str.split()`` yields.
        max_words: Strings with at most this many words are returned whole.
        chunk_words: Number of words per piece for longer strings.

    Returns:
        ``[s]`` (the string untouched, original spacing included) when it has
        at most ``max_words`` words; otherwise a list of pieces, each made of
        up to ``chunk_words`` words joined by single spaces.

    Raises:
        ValueError: If ``chunk_words`` is smaller than 1.
    """
    if chunk_words < 1:
        raise ValueError("chunk_words must be at least 1")
    # Split once; the original re-split the whole string for every chunk.
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk_words stops exactly at the last word, so no empty
    # trailing piece is produced when the word count is a multiple of the
    # chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
# Extract the text of the document at `filename1`, write a cleaned-up copy to
# disk, and return `(refined, total_scenes)`: `refined` is a list of non-empty,
# stripped paragraphs (split on blank lines) and `total_scenes` is a counter
# incremented in the final loop below.
# Side effects: writes "file.txt" and "file1.txt" relative to the current
# working directory.
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the statements below cannot be confirmed from here. The comments describe
# individual statements and hedge anything that depends on nesting.
# NOTE(review): `textract` is called in this function, but the only
# `import textract` in the visible file is commented out — confirm.
def getRefined(filename1):
#print("get_refined_called")
total_scenes = 0
# Raw text of the input document as bytes.
text = textract.process(filename1, encoding="utf8", errors='ignore')
# Dump the extracted bytes to the scratch file "file.txt" ...
filename="file.txt"
f=open(filename, 'wb')
f.write(text)
f.close()
# Set to True further down when a speaker-cue line has just been written.
dialog_coming=False
# ... and read it back as UTF-8 text, ignoring undecodable bytes.
f=open(filename, 'r', encoding="utf8", errors='ignore')
doc11=f.read()
f.close()
# Cleaned-up lines are written to "file1.txt".
f1=open("file1.txt",'w', encoding="utf8", errors='ignore')
# `c` is not used again in this function.
c=0
# Becomes True once a scene-heading line has been seen.
flag=False
for line in doc11.split("\n"):
# Scene-heading test: literal INT/EXT/I-E prefixes or the module-level
# `slug_pattern` regex. Heading lines are written unchanged (not stripped).
if (line.strip().startswith(('INT.','INT ')) or \
line.strip().startswith(('I/E','E/I')) or \
line.strip().startswith(('EXT.','EXT ')) or \
line.strip().startswith('EXT/INT') or \
line.strip().startswith('INT/EXT') or \
re.match(slug_pattern,line.strip())):
flag=True
f1.write(line)
f1.write('\n')
continue
else:
line = line.strip()
if flag:
# A stripped string can never equal '\n', so this comparison is never true.
if line.strip()=='\n':
continue
# Blank lines are skipped while dialogue is expected.
if dialog_coming and (line=='\n' or line.strip()==""):
continue
if dialog_coming:
f1.write(line)
f1.write('\n')
# A line starting with "(...)" is a parenthetical: dialogue is still expected
# afterwards. Any other line presumably ends the dialogue state — this
# depends on the lost nesting, confirm.
if re.match(r"\(.*\)",line):
continue
else:
dialog_coming=False
continue
continue
# Speaker cue: an all-uppercase line that fully matches the name pattern
# (letters/apostrophes, optional '-', optional "#<digit>", optional "(...)").
if line.isupper() and re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*",line.strip()):
f1.write(line)
f1.write('\n')
dialog_coming=True
continue
# `line` comes from split("\n") and cannot equal '\n', so this test is
# always true when reached.
if not line=='\n':
f1.write(line)
f1.write('\n')
f1.close()
filename1="file1.txt"
# "file1.txt" is run through textract again and the result overwrites
# "file.txt", so at this point "file.txt" holds the text of "file1.txt".
text = textract.process(filename1, encoding="utf8", errors='ignore')
filename="file.txt"
# `file_extension` is not used afterwards.
_, file_extension = os.path.splitext(filename1)
f=open(filename, 'wb')
f.write(text)
f.close()
# Re-read with the platform default encoding (no `encoding` argument here,
# unlike the earlier reads) and split into paragraphs on blank lines.
# `input` shadows the builtin of the same name.
with open(filename, "r") as input:
input_ = input.read().split('\n\n')
refined=[]
for line in input_:
refined.append(line.strip())
# Drop empty paragraphs.
refined=list(filter(lambda a: a != "", refined))
#print("processing the script")
# Walk the paragraphs: ones that do not look like a scene heading bump
# `total_scenes`; on what appears to be the first heading the list is cut to
# start there (`refined[i:]`) and the loop stops — nesting to be confirmed.
# NOTE(review): despite its name, `total_scenes` is incremented for
# NON-heading paragraphs here.
for i in range(len(refined)):
if not (refined[i].strip().startswith(('INT.','INT ')) or refined[i].strip().startswith(('EXT.','EXT ')) or refined[i].strip().startswith('EXT/INT') or refined[i].strip().startswith('INT/EXT') or refined[i].strip().startswith(('I/E','E/I')) or re.match(slug_pattern,refined[i].strip()) or re.match(pat,refined[i].strip())):
total_scenes = total_scenes + 1
continue
refined=refined[i:]
break
# refined.append(line.strip())
refined=list(filter(lambda a: a != "", refined))
return refined,total_scenes
def getSlugAndNonSlug(refined):
    """Partition paragraphs into scene headings and everything else.

    Each paragraph is stripped first. It counts as a heading when it starts
    with one of the literal INT/EXT/I-E prefixes below, or when the
    module-level ``slug_pattern`` or ``pat`` regex matches at its start.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs, each
        in input order.
    """
    heading_prefixes = (
        'INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT',
    )
    sluglines = []
    without_slug = []
    for raw in refined:
        text = raw.strip()
        is_heading = (
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        target = sluglines if is_heading else without_slug
        target.append(text)
    return sluglines, without_slug
def getSpeakers(without_slug):
    """Collect candidate speaker names from non-heading paragraphs.

    A line is a candidate when it is all uppercase, a look-ahead line in the
    same paragraph is not blank, and the name regex matches at its start. The
    name is the text before the first "(", stripped.

    NOTE(review): the look-ahead line is the one TWO positions after the
    candidate, clamped to the paragraph's last line (so it can be the
    candidate itself). This mirrors the original index arithmetic exactly;
    confirm whether the line directly after the candidate was intended.

    Args:
        without_slug: Iterable of paragraph strings with lines separated
            by newlines.

    Returns:
        List of unique, non-empty names (order unspecified) that contain none
        of the module-level ``reserved_words`` as a space-separated word.
    """
    name_regex = r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*"
    candidates = []
    for block in without_slug:
        rows = block.split('\n')
        last_index = len(rows) - 1
        for position, row in enumerate(rows):
            if not row.isupper():
                continue
            lookahead = rows[min(position + 2, last_index)]
            if lookahead.strip() == "":
                continue
            if re.match(name_regex, row):
                candidates.append(row.split("(")[0].strip())
    blocked = set(reserved_words)
    return [
        name
        for name in set(candidates)
        if len(name) > 0 and blocked.isdisjoint(name.split(" "))
    ]
# Group the paragraphs in `refined` into per-scene lists and collect dialogue
# metadata. Returns `(s, actionline, parenthetical_lis, speakers, dialogues)`:
#   s                 - list of scenes; each scene is a list whose items are
#                       strings (heading / action lines) or one-key dicts
#                       ({speaker: [...]} or {'Transition': text})
#   actionline        - action paragraphs with whitespace collapsed
#   parenthetical_lis - parenthetical (or "NONE") per recorded dialogue
#   speakers          - de-duplicated speaker names (order unspecified)
#   dialogues         - the {speaker: [...]} dicts in order of appearance
# `total_scenes` is accepted but never read in this function.
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the statements below cannot be confirmed from here; comments that depend
# on nesting are hedged.
def getScenes(refined,total_scenes,characters):
# To find scenes data structure and prev and next scenes numbers
i=0
scene=[]
dialogues=[]
speakers=[]
# Written below but never read or returned in this function.
slugline_dic={}
# The following are assigned but never read in this function:
# prev_dial_speaker, next_dial_speaker, pc, successor_scene_no,
# predecessor_scene_no, patttern.
prev_dial_speaker=""
next_dial_speaker=""
pc=0
scene_no=0
actionline=[]
successor_scene_no=0
predecessor_scene_no=0
parenthetical_lis=[]
scenes=[]
speaker=""
parenthetical='NONE'
patttern=r'[\d]*[.]?[\s]*[IE][NX]T'
for line in refined:
# Scene heading: close the scene collected so far and start a new one whose
# first item is the heading paragraph.
if line.strip().startswith(('INT.','INT')) or line.strip().startswith(('EXT.','EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E','E/I')) or re.match(slug_pattern,line.strip()) or re.match(pat,line.strip()):
scenes.append(scene)
scene=[]
i=0
scene_no+=1
scene.append(line)
# First line of the heading with digits, '.', '-' and spaces stripped from
# both ends.
slugline_dic[scene_no]=line.split("\n")[0].strip('0123456789.- ')
else:
lis=line.split("\n")
lis=[l.strip() for l in lis]
word=lis[0]
# Dialogue paragraph: its first line, minus anything from the first "(" on,
# is a known character name.
if word.split('(')[0].strip() in characters:
mydic={}
prev_dial_speaker=speaker
speaker=word.split('(')[0].strip()
# Second line starting with "(...)" is taken as the parenthetical; the
# dialogue is the remaining lines joined with a space.
if len(lis)>1 and re.match(r"\(.*\)",lis[1]):
pc=pc+1
parenthetical=lis[1]
parenthetical=parenthetical.replace("\n","")
dia=' '.join(lis[2:])
dia=dia.replace("\n","")
##renu
dia=dia.replace("\"", '')
else:
# NOTE(review): lines are joined with NO separator here but with a space in
# the branch above — confirm which is intended.
dia=''.join(lis[1:])
dia=dia.replace("\n","")
dia=dia.replace("\"", '')
# A cue with neither dialogue text nor a parenthetical fails this test;
# which of the following statements it guards depends on the lost nesting.
if not (len(dia)==0 and parenthetical=="NONE"):
# NOTE(review): `main_lis` is not defined anywhere in the visible source. If it
# is indeed undefined, the bare `except` clauses swallow the resulting
# NameError and `prev` / `next` are always "". `next` also shadows the builtin.
if i-1 >= 0:
try:
prev=main_lis[scene_no-1][i-1]
except:
prev=""
else:
prev=""
try:
next=main_lis[scene_no-1][i+1]
except:
next=""
#prev is previous speaker and next is next speaker of the dialogue
# Entry layout: [parenthetical, scene_no, dialogue, len(dialogue), prev, next].
mydic[speaker]=[parenthetical,scene_no,dia,len(dia),prev,next]
# print(mydic)
prev,next="",""
i=i+1
speakers.append(speaker)
parenthetical_lis.append(parenthetical)
dialogues.append(mydic)
scene.append(mydic)
parenthetical="NONE"
else:
# Other paragraph: collapse all whitespace runs to single spaces; an exact
# match against the module-level `transitions` list is stored as
# {'Transition': ...}, anything else as an action line.
line=line.replace("\n"," ")
line=' '.join(line.split())
if line.strip() in transitions:
scene.append({'Transition':line.strip()})
continue
actionline.append(line)
scene.append(line.strip())
# Close the last scene; the first element of `scenes` (whatever was collected
# before the first heading) is dropped.
scenes.append(scene)
speakers=list(set(speakers))
scenes=scenes[1:]
# Flatten: string items are split on "\n" into separate items, dict items are
# kept as they are.
s = []
for scene in scenes:
s1=[]
for ele in scene:
if type(ele) == type(""):
s1.extend(ele.split("\n"))
else:
s1.append(ele)
s.append(s1)
return s,actionline,parenthetical_lis,speakers,dialogues
# Script driver (runs at module level): parse the screenplay whose path is the
# first command-line argument. sys.argv[1] raises IndexError if no argument is
# given. The names bound here (`scenes` in particular) are read by the
# module-level code further down.
filename1 = sys.argv[1]
#print(filename1)
# Cleaned paragraphs plus the counter returned by getRefined().
refined,total_scenes = getRefined(filename1)
#print(refined)
# Split paragraphs into scene headings and the rest.
sluglines,without_slug = getSlugAndNonSlug(refined)
# Candidate speaker names found in the non-heading paragraphs.
characters = getSpeakers(without_slug)
# Per-scene structure and the flat lists of action lines / dialogues.
scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
#print(scenes)
def language_detector(text):
    """Return the source language the translation service detects for ``text``.

    The text is sent to the module-level ``translate_client`` with Hindi
    ('hi') as the target; the translation itself is discarded and only the
    ``"detectedSourceLanguage"`` field of the response is returned.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]
def script_det(text):
    """Return the script of the first non-punctuation character of ``text``.

    The first character that is not in the punctuation set below is passed to
    ``script_cat`` and the first element of its result is returned. If every
    character is punctuation, or ``text`` is empty, the empty string is passed
    instead.

    NOTE(review): whitespace and digits are not in the punctuation set, so a
    leading space or digit is the character that gets classified — confirm
    this is intended.
    """
    # Raw string: the original literal contained the invalid escape "\,",
    # which newer Python versions warn about. The character set is unchanged
    # (it still contains both the backslash and the comma).
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the detected dialogue languages.

    Args:
        dialogue_language: List with one detected language code per dialogue.
        non_dial_src_lang: Language code detected for the action lines.

    Returns:
        A tuple ``(A, B, C)``:

        * ``A`` - language of the highest number of full dialogues (ties go
          to the language seen first); ``None`` when there are no dialogues.
        * ``B`` - number of dialogues in the action-line language.
        * ``C`` - number of dialogues in other languages, i.e. in neither
          ``A`` nor the action-line language.

        When all dialogues share a single language, ``B`` and ``C`` are both
        0, as in the original implementation.
    """
    counts = Counter(dialogue_language)
    if not counts:
        # Nothing to rank; the original raised IndexError here.
        return None, 0, 0
    A = counts.most_common(1)[0][0]
    if len(counts) == 1:
        return A, 0, 0
    # 0 when the action-line language never occurs in dialogue; the original
    # raised ValueError in that case.
    B = counts.get(non_dial_src_lang, 0)
    # Count A's dialogues once even when A is also the action-line language.
    accounted = counts[A] if non_dial_src_lang == A else counts[A] + B
    # The original summed every count from the third-ranked language on, which
    # is only right when the action-line language happens to rank second.
    C = sum(counts.values()) - accounted
    return A, B, C
# Detect the action-line language (from the first action line found) and the
# language of every dialogue, then summarise the result with A_B_C().
dialogue_language = []
count = 0
# Default for scripts that contain no action line at all; without it the
# A_B_C() call below failed with NameError in that case.
non_dial_src_lang = None
for scene in tqdm(scenes[:]):
    for i, line in enumerate(scene):
        # Item 0 of every scene is its heading.
        if i == 0:
            continue
        if isinstance(line, str):
            # Only the very first action line is sent to the detector.
            if count == 0:
                non_dial_src_lang = language_detector(line)
                count += 1
        else:
            # Dict items hold exactly one key: a speaker name or 'Transition'.
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            # Index 2 of the entry is the dialogue text.
            dial_src_lang = language_detector(line[speaker][2])
            dialogue_language.append(dial_src_lang)
A, B, C = A_B_C(dialogue_language, non_dial_src_lang)
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Report whether any word of ``dial`` is in the action-line language.

    Words are checked left to right with ``language_detector``. The first
    word whose detected language equals ``non_dial_src_lang`` is printed and
    the check stops.

    Returns:
        The string "True" if such a word exists, otherwise the string "False".
    """
    hits = (
        token
        for token in dial.split()
        if language_detector(token) == non_dial_src_lang
    )
    # The generator is lazy, so detection stops at the first matching word.
    first_hit = next(hits, None)
    if first_hit is None:
        return "False"
    print("word", first_hit)
    return "True"
def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Report whether any word of ``dial`` is in a third language.

    A word counts when its detected language is neither the action-line
    language (``non_dial_src_lang``) nor the main dialogue language (``A``).
    The first such word is printed and the check stops.

    Returns:
        The string "True" if such a word exists, otherwise the string "False".
    """
    for word in dial.split():
        # Detect once per word; the original called the detector twice.
        word_lang = language_detector(word)
        # The original tested `!= non_dial_src_lang or != A`, which is true
        # for every word whenever the two languages differ. A word is "other"
        # only when it matches neither of them.
        if word_lang not in (non_dial_src_lang, A):
            print("in 4")
            print("word", word)
            return "True"
    return "False"
def word_with_actionline(scenes):
    """Look for a main-language dialogue containing an action-line-language word.

    Only dialogues whose detected language equals the module-level ``A`` are
    examined; each is checked word by word with ``dial_each_word_lang1``
    against the module-level ``non_dial_src_lang``.

    Returns:
        The string "True" as soon as one such dialogue is found; ``None``
        (implicitly) when none is.
    """
    for current_scene in tqdm(scenes[:]):
        # Item 0 of every scene is its heading.
        for entry in current_scene[1:]:
            if isinstance(entry, str):
                continue
            # Dict items hold exactly one key: a speaker name or 'Transition'.
            (speaker,) = entry.keys()
            if speaker == 'Transition':
                continue
            spoken = entry[speaker][2]
            if language_detector(spoken) != A:
                continue
            verdict = dial_each_word_lang1(non_dial_src_lang, spoken)
            if verdict == "True":
                return verdict
def word_with_other(scenes):
    """Look for a main-language dialogue flagged by ``dial_each_word_lang2``.

    Only dialogues whose detected language equals the module-level ``A`` are
    examined; each is passed to ``dial_each_word_lang2`` together with the
    module-level ``non_dial_src_lang`` and ``A``.

    Returns:
        The string "True" as soon as one dialogue is flagged; ``None``
        (implicitly) when none is.
    """
    for current_scene in tqdm(scenes[:]):
        # Item 0 of every scene is its heading.
        for entry in current_scene[1:]:
            if isinstance(entry, str):
                continue
            # Dict items hold exactly one key: a speaker name or 'Transition'.
            (speaker,) = entry.keys()
            if speaker == 'Transition':
                continue
            spoken = entry[speaker][2]
            if language_detector(spoken) != A:
                continue
            verdict = dial_each_word_lang2(non_dial_src_lang, A, spoken)
            if verdict == "True":
                return verdict
# Word-level checks, then the report printed on stdout.
word_lang_with_actionline = word_with_actionline(scenes)
word_lang_with_other = word_with_other(scenes)

# NOTE(review): the labels below are reproduced exactly as they were, including
# the "actionline_lanuge" spelling, the "UI_option5" underscore and the mixed
# yes/Yes/NO casing — presumably something reads this output; confirm before
# normalising them.
print("actionline_lanuge", non_dial_src_lang)
print("dial_language", A)
# Option 3: B > 0. Option 4: C > 0.
print("UI option3 - yes" if B > 0 else "UI option3 - no")
print("UI option4 - yes" if C > 0 else "UI option4 - no")
# Options 5 and 6: results of the two word-level checks above.
print("UI option5 - Yes" if word_lang_with_actionline == "True" else "UI_option5 - NO")
print("UI option6 - Yes" if word_lang_with_other == "True" else "UI option6 - No")