Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection4march.py

637 lines
23 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from collections import Counter
from statistics import mode
from .translation.script_detector import script_cat
from .translation.script_writing import default_script
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
doc = docx.Document()
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
# Project root as reported by the MNF settings; interpolated into every file
# path built in this module.
basePath = BasePath()
# Google Cloud Translation setup.
# The credentials variable is assigned before the clients below are created.
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector() in this module.
translate_client = Translate.Client()
# Second client plus resource path; not referenced by the code visible in this
# file. NOTE(review): confirm other modules rely on them before removing.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
# Scene-heading matcher: optional digits, optional dot, optional whitespace,
# then one of [IE] followed by one of [NX] and a "T" (covers INT and EXT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Combined interior/exterior headings such as "I/E" or "E/I", optionally
# preceded by digits and whitespace.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs equal to one of these are stored as {'Transition': ...} entries.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]
# Upper-case words that disqualify a candidate speaker name in getSpeakers().
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]
# Set the 'Normal' style of the module-level python-docx document to
# 12 pt Courier New.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s):
    """Split a long piece of text into chunks of at most 32 words.

    Text containing up to 256 whitespace-separated words is returned
    unchanged as a single-element list. Longer text is cut into consecutive
    32-word chunks, each re-joined with single spaces.

    Args:
        s: The text to split.

    Returns:
        A list of strings; never contains an empty chunk for long input.
    """
    words = s.split()
    if len(words) <= 256:
        return [s]
    # Stepping by 32 over the word list avoids the empty trailing chunk that
    # a "count // 32 + 1" loop yields when the count is a multiple of 32.
    return [" ".join(words[start:start + 32]) for start in range(0, len(words), 32)]
def _looks_like_slugline(text):
    """Return a truthy value when *text* starts like a scene heading.

    Accepts the literal INT/EXT/I-E prefixes as well as anything matched at
    the start of *text* by the module-level ``slug_pattern`` or ``pat``.
    """
    return (
        text.startswith(('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT'))
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slugline on.

    The extracted text is written to ``file.txt``, rewritten line by line
    into ``file1.txt`` (both under ``{basePath}/conversion/translation``),
    extracted again and finally split into blank-line separated paragraphs.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        A ``(refined, total_scenes)`` tuple. ``refined`` is the list of
        non-empty, stripped paragraphs starting at the first paragraph that
        looks like a slugline (all paragraphs when none does).
        ``total_scenes`` is the number of paragraphs that precede that
        slugline.
    """
    print("get_refined_called")
    total_scenes = 0
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    # NOTE(review): `import textract` is commented out at the top of this
    # file; unless the name is provided some other way this call raises
    # NameError.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        doc11 = raw_file.read()

    # NOTE(review): the nesting of this loop was reconstructed from a copy of
    # the file that had lost its indentation; verify against the original.
    dialog_coming = False  # True right after a speaker cue has been written
    flag = False           # True once the first slugline has been seen
    print("Slugline")
    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("line 427:", line)
            if _looks_like_slugline(line) and not line.startswith(('INTERCUT', 'INTERMISSION', 'INTERVAL')):
                flag = True
                f1.write(line)
                f1.write('\n')
                continue
            print("line 96: else loop", line)
            if not flag:
                # Everything before the first slugline is dropped.
                continue
            print("line 99: if loop:", line)
            if dialog_coming:
                if line == "":
                    # Blank lines between a speaker cue and its dialogue are
                    # skipped so both end up in the same paragraph.
                    print("line empty or just have newline", line)
                    continue
                print("line 101 probable dialog or PC: ", line)
                f1.write(line)
                f1.write('\n')
                if not re.match(r"\(.*\)", line):
                    # A parenthetical keeps the dialogue pending; any other
                    # line ends it.
                    print(" line 207: else of PCs", line)
                    dialog_coming = False
                    print(" line 457 dialog over")
                continue
            # NOTE(review): the first pattern nests quantifiers and may
            # backtrack heavily on long upper-case lines that do not match.
            if line.isupper() and (re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", line) or re.fullmatch(r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+", line)):
                print("line 111: May be speaker: ", line)
                f1.write(line)
                f1.write('\n')
                dialog_coming = True
                continue
            print("470 probably action or something else so just write it", line)
            f1.write(line)
            f1.write('\n')
    print("line 132 file closed")

    # Second extraction pass: file.txt is overwritten with the text extracted
    # from the cleaned file1.txt.
    text = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    print("line 136: ", text)
    with open(raw_path, 'wb') as raw_file:
        raw_file.write(text)
    # Decode explicitly as UTF-8, like every other read in this function,
    # instead of relying on the platform default encoding.
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as raw_file:
        paragraphs = raw_file.read().split('\n\n')

    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]
    # Drop every paragraph before the first slugline, counting the dropped ones.
    for index, paragraph in enumerate(refined):
        if _looks_like_slugline(paragraph):
            refined = refined[index:]
            break
        total_scenes = total_scenes + 1
    print("line 163:Refined", refined)
    return refined, total_scenes
def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A ``(sluglines, without_slug)`` tuple of lists holding the stripped
        paragraphs, each in input order.
    """
    # NOTE(review): unlike the line filter in getRefined, the bare 'INT'
    # prefix here also accepts paragraphs starting with INTERCUT / INTERVAL /
    # INTERMISSION -- confirm whether that is intended.
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for paragraph in refined:
        text = paragraph.strip()
        is_heading = (
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        bucket = sluglines if is_heading else without_slug
        bucket.append(text)
    return sluglines, without_slug
def getSpeakers(without_slug):
    """Collect candidate speaker names from non-slugline paragraphs.

    A line counts as a speaker cue when it is upper-case, the probed
    follow-up line of the same paragraph is not blank, and it starts with the
    cue pattern. The name is the text before the first "(".

    Args:
        without_slug: Iterable of paragraph strings (lines separated by "\\n").

    Returns:
        A list of unique, non-empty names that contain none of the
        ``reserved_words``; the order is not defined.
    """
    cue_pattern = r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*"
    names = []
    for paragraph in without_slug:
        rows = paragraph.split('\n')
        last_index = len(rows) - 1
        for position, row in enumerate(rows):
            # The probed line is two rows ahead, clamped to the last row (for
            # a one-line paragraph that is the row itself).
            probe = rows[min(position + 2, last_index)]
            if not row.isupper() or probe.strip() == "":
                continue
            if re.match(cue_pattern, row):
                names.append(row.split("(")[0].strip())
    unique_names = list(set(names))
    return [
        name
        for name in unique_names
        if len(name) > 0 and set(name.split(" ")).isdisjoint(reserved_words)
    ]
def getScenes(refined,total_scenes,characters):
    """Group paragraphs into scenes and classify each one.

    A paragraph that looks like a slugline starts a new scene. Any other
    paragraph is a dialogue block when its first line (up to the first "(")
    is in *characters*; otherwise it is a transition when it equals an entry
    of ``transitions``, or an action line.

    Args:
        refined: Paragraph strings, as returned by getRefined().
        total_scenes: Accepted for interface compatibility; not used.
        characters: Collection of known speaker names.

    Returns:
        A tuple ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:
        * scenes: one list per scene; strings (slugline / action text, split
          on newlines), ``{'Transition': text}`` dicts and dialogue dicts of
          the form ``{speaker: [parenthetical, scene_no, dialogue,
          len(dialogue), prev, next]}``. Content before the first slugline
          is not included.
        * actionline: every action line, including those before the first
          slugline.
        * parenthetical_lis: the parenthetical (or "NONE") of each dialogue.
        * speakers: unique speakers that have a dialogue (unordered).
        * dialogues: all dialogue dicts in order of appearance.
    """
    # NOTE(review): the bare 'INT' prefix also accepts paragraphs starting
    # with INTERCUT / INTERVAL / INTERMISSION, unlike the line filter in
    # getRefined -- confirm whether that is intended.
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I')
    i = 0  # index of the next dialogue within the current scene
    scene = []
    scenes = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    parenthetical = 'NONE'
    for line in refined:
        stripped = line.strip()
        if stripped.startswith(heading_prefixes) or re.match(slug_pattern, stripped) or re.match(pat, stripped):
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        print(" \n Line 222 probable dialogue list", lis)
        speaker = lis[0].split('(')[0].strip()
        if speaker not in characters:
            # Collapse all whitespace (including newlines) to single spaces.
            text = ' '.join(line.split())
            if text in transitions:
                scene.append({'Transition': text})
                continue
            actionline.append(text)
            scene.append(text)
            continue

        print("Speaker 228", speaker)
        if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
            parenthetical = lis[1].replace("\n", "")
            dia = ' '.join(lis[2:])
        else:
            # Join wrapped dialogue lines with a space; joining them with an
            # empty string glued the last word of one line to the first word
            # of the next.
            dia = ' '.join(part for part in lis[1:] if part)
        dia = dia.replace("\n", "").replace("\"", '')
        print(" length dia\n", len(dia))
        if not (len(dia) == 0 and parenthetical == "NONE"):
            print(" len dia != and Parenthetical == NONE: 384 ")
            # NOTE(review): `main_lis` is not defined anywhere in this file,
            # so both lookups fall back to "" here. Only the lookup failures
            # are caught now instead of every exception.
            prev = ""
            if i - 1 >= 0:
                try:
                    prev = main_lis[scene_no-1][i-1]
                except (NameError, LookupError, TypeError):
                    prev = ""
            try:
                following = main_lis[scene_no-1][i+1]
            except (NameError, LookupError, TypeError):
                following = ""
            mydic = {speaker: [parenthetical, scene_no, dia, len(dia), prev, following]}
            print("line 259", mydic)
            i = i + 1
            speakers.append(speaker)
            parenthetical_lis.append(parenthetical)
            dialogues.append(mydic)
            scene.append(mydic)
        parenthetical = "NONE"

    scenes.append(scene)
    speakers = list(set(speakers))
    # The first collected "scene" holds whatever preceded the first slugline.
    s = []
    for built_scene in scenes[1:]:
        flattened = []
        for element in built_scene:
            if isinstance(element, str):
                flattened.extend(element.split("\n"))
            else:
                flattened.append(element)
        s.append(flattened)
    print("dialogue: ", dialogues)
    return s, actionline, parenthetical_lis, speakers, dialogues
# def getScenes(refined, total_scenes, characters):
# # To find scenes data structure and prev and next scenes numbers
# i = 0
# scene = []
# dialogues = []
# speakers = []
# slugline_dic = {}
# prev_dial_speaker = ""
# next_dial_speaker = ""
# pc = 0
# scene_no = 0
# actionline = []
# successor_scene_no = 0
# predecessor_scene_no = 0
# parenthetical_lis = []
# scenes = []
# speaker = ""
# parenthetical = 'NONE'
# predecessor_scene_no_dict = {
# 'Scene '+str(i+1): 0 for i in range(total_scenes)}
# dia_count = {'Scene '+str(i+1): 0 for i in range(total_scenes)}
# successor_scene_no_dict = {
# 'Scene '+str(i+1): 0 for i in range(total_scenes)}
# parenthetical_count_dict = {
# 'Scene '+str(i+1): 0 for i in range(total_scenes)}
# patttern = r'[\d]*[.]?[\s]*[IE][NX]T'
# for line in refined:
# if ((line.strip().startswith(('INT.', 'INT')) or line.strip().startswith(('EXT.', 'EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, line.strip()) or re.match(pat, line.strip())) and not (line.strip().startswith(('INTERCUT', 'INTERVAL', 'INTERMISSION')))):
# # if re.match(patttern,line):
# # current_scene=line.split(" ")[0]
# # current_scene=current_scene[:1]
# # current_scene=int(current_scene)
# # successor_scene_no=min(int(current_scene)+2,total_scenes)
# # predecessor_scene_no=max(int(current_scene),-1)
# # else:
# successor_scene_no = min(scene_no+3, total_scenes)
# predecessor_scene_no = max(scene_no+1, 0)
# if scene_no > 0:
# parenthetical_count_dict['Scene '+str(scene_no+1)] = pc
# pc = 0
# scenes.append(scene)
# scene = []
# i = 0
# scene_no += 1
# predecessor_scene_no_dict['Scene ' +
# str(scene_no+1)] = predecessor_scene_no
# successor_scene_no_dict['Scene ' +
# str(scene_no+1)] = successor_scene_no
# successor_scene_no_dict['Scene '+str(1)] = 2
# scene.append(line)
# slugline_dic[scene_no] = line.split(
# "\n")[0].strip('0123456789.- ')
# else:
# lis = line.split("\n")
# lis = [l.strip() for l in lis]
# print(" \n Line 363 probable dialogue list", lis)
# word = lis[0]
# extendedSpeaker = ""
# if word.split('(')[0].strip() in characters:
# mydic = {}
# prev_dial_speaker = speaker
# speakerline = word.split('(')
# # speaker = word.split('(')[0].strip()
# speaker = speakerline[0].strip()
# print("Speaker 378", speaker)
# extendedSpeaker = word.strip()
# if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
# pc = pc+1
# parenthetical = lis[1]
# parenthetical = parenthetical.replace("\n", "")
# dia = ' '.join(lis[2:])
# dia = dia.replace("\n", "")
# # renu
# # dia=dia.replace("\"", '')
# else:
# dia = ''.join(lis[1:])
# dia = dia.replace("\n", "")
# dia = dia.replace("\"", '')
# print(" length dia\n", len(dia))
# if not (len(dia) == 0 and parenthetical == "NONE"):
# print(" len dia != and Parenthetical == NONE: 384 ")
# if i-1 >= 0:
# try:
# prev = main_lis[scene_no-1][i-1]
# except:
# prev = ""
# else:
# prev = ""
# try:
# next = main_lis[scene_no-1][i+1]
# except:
# next = ""
# # prev is previous speaker and next is next speaker of the dialogue
# mydic[speaker] = [parenthetical,
# scene_no, dia, len(dia), prev, next, extendedSpeaker]
# print("mydic 398", speaker, mydic[speaker])
# dia_count['Scene '+str(scene_no)] += 1
# # print(mydic)
# prev, next = "", ""
# i = i+1
# speakers.append(speaker)
# parenthetical_lis.append(parenthetical)
# dialogues.append(mydic)
# scene.append(mydic)
# parenthetical = "NONE"
# else:
# line = line.replace("\n", " ")
# line = ' '.join(line.split())
# if line.strip() in transitions:
# scene.append({'Transition': line.strip()})
# continue
# actionline.append(line)
# scene.append(line.strip())
# scenes.append(scene)
# parenthetical_count_dict['Scene '+str(scene_no)] = pc
# speakers = list(set(speakers))
# scenes = scenes[1:]
# print("Scenes:", scenes)
# # for removing '\n' from action lines
# # return scenes also if '\n' required and modify practice_with_db also
# # s = []
# # for scene in scenes:
# # s1=[]
# # for ele in scene:
# # if type(ele) == type(""):
# # s1.extend(ele.split("\n"))
# # else:
# # s1.append(ele)
# # s.append(s1)
# return scenes,actionline,parenthetical_lis,speakers,dialogues
def language_detector(text):
    """Return the source language the translate client detects for *text*.

    The text is sent to ``translate_client.translate`` with Hindi ('hi') as
    the target language; only the ``detectedSourceLanguage`` field of the
    response is used.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]
def script_det(text):
    """Return the script of the first non-punctuation character of *text*.

    Args:
        text: The text to inspect.

    Returns:
        The first element of ``script_cat(char)``, where ``char`` is the
        first character of *text* that is not listed in the punctuation set,
        or the empty string when there is no such character.
    """
    # Raw string: the set contains a literal backslash followed by a comma,
    # which is an invalid escape sequence in a normal string literal.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # NOTE(review): whitespace and digits are not in the set, so they are
    # passed to script_cat as well -- confirm that is acceptable.
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]
'''
A. Language of the highest number of full dialogues
B. Number of dialogues in the action-line language
C. Number of dialogues in other languages
'''
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise which languages the dialogues are written in.

    Args:
        dialogue_language: Detected language code of every dialogue.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        A tuple ``(A, B, C)``:
        * A: the most frequent dialogue language (first seen wins a tie).
        * B: number of dialogues in the action-line language; 0 when only
          one dialogue language exists or the action-line language does not
          occur among the dialogues.
        * C: number of dialogues in languages other than A and the
          action-line language; 0 when only one dialogue language exists.

    Raises:
        IndexError: If *dialogue_language* is empty.
    """
    # most_common() sorts by count, keeping first-seen order among ties.
    ranked = Counter(dialogue_language).most_common()
    sources = [language for language, _ in ranked]
    counts = [count for _, count in ranked]
    A = sources[0]
    if len(sources) == 1:
        return A, 0, 0
    if non_dial_src_lang not in sources:
        return A, 0, sum(counts[1:])
    B = counts[sources.index(non_dial_src_lang)]
    # Everything that is neither A nor the action-line language. Summing
    # counts[2:] was only correct when the action-line language ranked second.
    C = sum(counts) - counts[0]
    if non_dial_src_lang != A:
        C -= B
    return A, B, C
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Tell whether any word of *dial* is detected as the action-line language.

    Words are checked in order with language_detector() and checking stops at
    the first hit.

    Returns:
        The string "True" or "False".
    """
    found = any(
        language_detector(token) == non_dial_src_lang
        for token in dial.split()
    )
    return "True" if found else "False"
def dial_each_word_lang2(non_dial_src_lang, A, dial ):
    """Tell whether any word of *dial* is in a third language.

    A word counts when its detected language is neither the action-line
    language nor the main dialogue language *A*.

    Args:
        non_dial_src_lang: Language code of the action lines.
        A: Language code of the majority of dialogues.
        dial: The dialogue text.

    Returns:
        The string "True" or "False".
    """
    expected_languages = (non_dial_src_lang, A)
    for word in dial.split():
        # Both languages must be ruled out; testing "!= x or != A" is true
        # for every word as soon as the two languages differ. Detecting once
        # per word also halves the number of API calls.
        if language_detector(word) not in expected_languages:
            return "True"
    return "False"
def word_with_actionline(scenes, A, non_dial_src_lang):
    """Tell whether a dialogue in language *A* contains an action-line-language word.

    Only dialogues whose detected language equals *A* are inspected word by
    word. The first element of every scene, plain strings and Transition
    entries are skipped.

    Args:
        scenes: Scene lists as produced by getScenes().
        A: Language code of the majority of dialogues.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        The string "True" as soon as such a word is found, otherwise
        "False" (also when both languages are the same).
    """
    if A == non_dial_src_lang:
        return "False"
    for scene in tqdm(scenes[:]):
        for element in scene[1:]:
            if isinstance(element, str):
                continue
            [speaker] = element.keys()
            if speaker == 'Transition':
                continue
            dialogue = element[speaker][2]
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang1(non_dial_src_lang, dialogue) == "True":
                return "True"
    # Explicit result instead of falling off the end and returning None.
    return "False"
def word_with_other(scenes, A, non_dial_src_lang):
    """Tell whether a dialogue in language *A* contains a word flagged by dial_each_word_lang2().

    Only dialogues whose detected language equals *A* are inspected. The
    first element of every scene, plain strings and Transition entries are
    skipped.

    Args:
        scenes: Scene lists as produced by getScenes().
        A: Language code of the majority of dialogues.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        The string "True" as soon as a dialogue is flagged, otherwise
        "False".
    """
    for scene in tqdm(scenes[:]):
        for element in scene[1:]:
            if isinstance(element, str):
                continue
            [speaker] = element.keys()
            if speaker == 'Transition':
                continue
            dialogue = element[speaker][2]
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue) == "True":
                return "True"
    # Explicit result instead of falling off the end and returning None.
    return "False"
def getInputs(filename1):
    """Analyse a script file and report its languages, scripts and UI flags.

    Runs the parsing pipeline (getRefined, getSlugAndNonSlug, getSpeakers,
    getScenes), detects the language and script of the first action line and
    of every dialogue, and derives four "Yes"/"No" options from the result.

    Args:
        filename1: Path of the script file.

    Returns:
        ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]``
    """
    refined, total_scenes = getRefined(filename1)
    sluglines, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    action_line_seen = False
    for scene in tqdm(scenes[:]):
        # The first element of a scene is skipped.
        for element in scene[1:]:
            if isinstance(element, str):
                # Only the first string element is used to detect the
                # action-line language and script.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(element)
                    non_dial_src_script = script_det(element)
                    action_line_seen = True
                continue
            [speaker] = element.keys()
            if speaker == 'Transition':
                continue
            spoken_text = element[speaker][2]
            language_of_all_dialogues.append(language_detector(spoken_text))
            script_of_all_dialogues.append(script_det(spoken_text))

    # NOTE(review): non_dial_src_lang / non_dial_src_script stay unbound when
    # no scene contains a string element after its first entry.
    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)
    word_lang_with_actionline = word_with_actionline(scenes, A, non_dial_src_lang)
    word_lang_with_other = word_with_other(scenes, A, non_dial_src_lang)

    print("actionline_lanuge", non_dial_src_lang)
    print("A = {} B = {} C = {}".format(A, B, C))
    print("dial_language", A)
    dial_src_lang = A
    print("dial_src_script", dial_src_script)

    # (condition, message when set, message when not set) for options 3-6.
    option_rules = (
        (B > 0, "UI option3 - yes", "UI option3 - no"),
        (C > 0, "UI option4 - yes", "UI option4 - no"),
        (word_lang_with_actionline == "True", "UI option5 - Yes", "UI_option5 - NO"),
        (word_lang_with_other == "True", "UI option6 - Yes", "UI option6 - No"),
    )
    ui_options = []
    for is_set, set_message, unset_message in option_rules:
        print(set_message if is_set else unset_message)
        ui_options.append("Yes" if is_set else "No")
    UI_option3, UI_option4, UI_option5, UI_option6 = ui_options

    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]
# filename1 = sys.argv[1]
# getInputs(filename1)