Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection5april.py

533 lines
19 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from google.cloud import translate_v2 as Translate
from google.cloud import translate
from MNF.settings import BasePath
from narration.vectorcode.code.functions import ScriptBreakdown
from .translation.script_writing import default_script
from .translation.script_detector import script_cat
from statistics import mode
from collections import Counter
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
# Module-level python-docx document whose 'Normal' style is configured below.
# (The original created this document twice in a row; the first instance was
# immediately discarded, so a single construction is sufficient.)
doc = docx.Document()
basePath = BasePath()

# google
# The credentials path is exported before the two clients below are created
# so that they can pick it up from the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/My First Project-2573112d5326.json"
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
translate_client = Translate.Client()
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"

# Slugline prefix: optional scene number, optional dot, optional whitespace,
# then one of I/E followed by N/X and T (e.g. "12. INT", "EXT").
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
# Combined interior/exterior prefix: optional number, optional whitespace,
# then "I/E", "E/I", "I/I" or "E/E" with an optional trailing dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Transition markers.
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:',
               'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'INTERVAL']
# Reserved upper-case words. NOTE(review): 'FADE' appears twice; the list is
# left unchanged in case other modules import it as-is.
reserved_words = ['MONTAGE', 'PBS', 'FADE',
                  'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']

# Default document typography: 12 pt Courier New on the 'Normal' style.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long sentence into smaller word-bounded chunks.

    Args:
        s: The text to split. Words are delimited by any whitespace.
        max_words: Texts with at most this many words are returned
            unchanged as a single-element list.
        chunk_words: Number of words per chunk when the text is split.

    Returns:
        ``[s]`` when the text has ``max_words`` words or fewer (the original
        string, whitespace untouched); otherwise a list of chunks, each
        holding up to ``chunk_words`` words joined by single spaces.

    The previous implementation iterated ``n // 32 + 1`` times, which
    produced a trailing empty string whenever the word count was an exact
    multiple of the chunk size; stepping through the word list avoids that
    and also splits the text only once.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
# def getRefined(filename1):
# print("get_refined_called")
# total_scenes = 0
# text = textract.process(filename1, encoding="utf8", errors='ignore')
# filename = rf"{basePath}/conversion/translation/file.txt"
# f = open(filename, 'wb')
# f.write(text)
# f.close()
# dialog_coming = False
# f = open(filename, 'r', encoding="utf8", errors='ignore')
# doc11 = f.read()
# f.close()
# f1 = open(rf"{basePath}/conversion/translation/file1.txt",
# 'w', encoding="utf8", errors='ignore')
# c = 0
# flag = False
# print("Slugline")
# for line in doc11.split("\n"):
# line = line.strip()
# print("line 427:", line)
# if (line.strip().startswith(('INT.', 'INT ')) or
# line.strip().startswith(('I/E', 'E/I')) or
# line.strip().startswith(('EXT.', 'EXT ')) or
# line.strip().startswith('EXT/INT') or
# line.strip().startswith('INT/EXT') or
# re.match(slug_pattern, line.strip()) or re.match(pat, line.strip())) and not (line.strip().startswith(('INTERCUT', 'INTERMISSION', 'INTERVAL'))):
# flag = True
# f1.write(line)
# f1.write('\n')
# continue
# else:
# print("line 96: else loop", line)
# #line = line.strip()
# if flag:
# print("line 99: if loop:", line)
# if line.strip() == '\n':
# continue
# if dialog_coming and (line == '\n' or line.strip() == ""):
# print("line empty or just have newline", line)
# continue
# if dialog_coming:
# print("line 101 probable dialog or PC: ", line)
# f1.write(line)
# f1.write('\n')
# if re.match(r"\(.*\)", line):
# continue
# else:
# print(" line 207: else of PCs", line)
# dialog_coming = False
# print(" line 457 dialog over")
# continue
# continue
# # if line.isupper() and re.fullmatch(r"([A-Z']*[.]*[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*",line.strip()):
# if line.isupper() and (re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", line.strip()) or re.fullmatch(r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+", line.strip())):
# print("line 111: May be speaker: ", line)
# f1.write(line)
# f1.write('\n')
# dialog_coming = True
# continue
# if not line == '\n':
# print(
# "470 probably action or something else so just write it", line)
# f1.write(line)
# f1.write('\n')
# f1.close()
# print("line 132 file closed")
# filename1 = rf"{basePath}/conversion/translation/file1.txt"
# # file.txt contains the data of file1.txt; it is currently unused. Maybe rename file.txt to file1.txt.
# text = textract.process(filename1, encoding="utf8", errors='ignore')
# print("line 136: ", text)
# filename = rf"{basePath}/conversion/translation/file.txt"
# _, file_extension = os.path.splitext(filename1)
# f = open(filename, 'wb')
# f.write(text)
# f.close()
# with open(filename, "r") as input:
# input_ = input.read().split('\n\n')
# refined = []
# for line in input_:
# refined.append(line.strip())
# refined = list(filter(lambda a: a != "", refined))
# #print("processing the script")
# for i in range(len(refined)):
# if not (refined[i].strip().startswith(('INT.', 'INT ')) or refined[i].strip().startswith(('EXT.', 'EXT ')) or refined[i].strip().startswith('EXT/INT') or refined[i].strip().startswith('INT/EXT') or refined[i].strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, refined[i].strip()) or re.match(pat, refined[i].strip())):
# total_scenes = total_scenes + 1
# continue
# refined = refined[i:]
# break
# # refined.append(line.strip())
# refined = list(filter(lambda a: a != "", refined))
# print("line 163:Refined", refined)
# return refined, total_scenes
# def getSlugAndNonSlug(refined):
# sluglines = []
# without_slug = []
# for para in refined:
# para = para.strip()
# if para.strip().startswith(('INT.', 'INT')) or para.strip().startswith(('EXT.', 'EXT')) or para.strip().startswith('EXT/INT') or para.strip().startswith(('I/E', 'E/I')) or para.strip().startswith('INT/EXT') or re.match(slug_pattern, para.strip()) or re.match(pat, para.strip()):
# sluglines.append(para)
# continue
# without_slug.append(para)
# return sluglines, without_slug
# def getSpeakers(without_slug):
# characters = []
# for para in without_slug:
# lis = para.split('\n')
# i = 0
# for item in lis:
# i = i+1
# i = min(i, len(lis)-2)
# if item.isupper() and not(lis[i+1].strip() == ""):
# if re.match(r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*", item):
# tem = item.split("(")[0].strip()
# characters.append(tem.strip())
# else:
# continue
# characters = list(set(characters))
# characters = list(filter(lambda x: len(x) > 0, characters))
# characters = [character for character in characters if set(
# character.split(" ")).intersection(reserved_words) == set()]
# return characters
# def getScenes(refined, total_scenes, characters):
# # To find scenes data structure and prev and next scenes numbers
# i = 0
# scene = []
# dialogues = []
# speakers = []
# slugline_dic = {}
# prev_dial_speaker = ""
# next_dial_speaker = ""
# pc = 0
# scene_no = 0
# actionline = []
# successor_scene_no = 0
# predecessor_scene_no = 0
# parenthetical_lis = []
# scenes = []
# speaker = ""
# parenthetical = 'NONE'
# patttern = r'[\d]*[.]?[\s]*[IE][NX]T'
# for line in refined:
# if line.strip().startswith(('INT.', 'INT')) or line.strip().startswith(('EXT.', 'EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, line.strip()) or re.match(pat, line.strip()):
# scenes.append(scene)
# scene = []
# i = 0
# scene_no += 1
# scene.append(line)
# slugline_dic[scene_no] = line.split("\n")[0].strip('0123456789.- ')
# else:
# lis = line.split("\n")
# lis = [l.strip() for l in lis]
# print(" \n Line 222 probable dialogue list", lis)
# word = lis[0]
# if word.split('(')[0].strip() in characters:
# mydic = {}
# prev_dial_speaker = speaker
# speaker = word.split('(')[0].strip()
# print("Speaker 228", speaker)
# if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
# pc = pc+1
# parenthetical = lis[1]
# parenthetical = parenthetical.replace("\n", "")
# dia = ' '.join(lis[2:])
# dia = dia.replace("\n", "")
# # renu
# dia = dia.replace("\"", '')
# else:
# dia = ''.join(lis[1:])
# dia = dia.replace("\n", "")
# dia = dia.replace("\"", '')
# print(" length dia\n", len(dia))
# if not (len(dia) == 0 and parenthetical == "NONE"):
# print(" len dia != and Parenthetical == NONE: 384 ")
# if i-1 >= 0:
# try:
# prev = main_lis[scene_no-1][i-1]
# except:
# prev = ""
# else:
# prev = ""
# try:
# next = main_lis[scene_no-1][i+1]
# except:
# next = ""
# # prev is previous speaker and next is next speaker of the dialogue
# mydic[speaker] = [parenthetical,
# scene_no, dia, len(dia), prev, next]
# print("line 259", mydic)
# #print("mydic 260", speaker, mydic[speaker])
# prev, next = "", ""
# i = i+1
# speakers.append(speaker)
# parenthetical_lis.append(parenthetical)
# dialogues.append(mydic)
# scene.append(mydic)
# parenthetical = "NONE"
# else:
# line = line.replace("\n", " ")
# line = ' '.join(line.split())
# if line.strip() in transitions:
# scene.append({'Transition': line.strip()})
# continue
# actionline.append(line)
# scene.append(line.strip())
# scenes.append(scene)
# speakers = list(set(speakers))
# scenes = scenes[1:]
# s = []
# for scene in scenes:
# s1 = []
# for ele in scene:
# if type(ele) == type(""):
# s1.extend(ele.split("\n"))
# else:
# s1.append(ele)
# s.append(s1)
# print("dialogue: ", dialogues)
# return s, actionline, parenthetical_lis, speakers, dialogues
def language_detector(text):
    """Return the source language that the translate client detects for *text*.

    The text is sent for translation to Hindi ('hi'); the translation itself
    is discarded and only the ``detectedSourceLanguage`` field of the
    response is returned.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]
def script_det(text):
    """Return the script of the first non-punctuation character in *text*.

    The first character of *text* that is not one of the listed punctuation
    marks is passed to ``script_cat`` and the first element of its result is
    returned. When *text* is empty or consists solely of punctuation, the
    empty string is passed to ``script_cat``.

    NOTE(review): whitespace and digits are not treated as punctuation, so a
    leading space or number is what gets classified - confirm this is
    intended.
    """
    # Raw string: the original non-raw literal contained the invalid escape
    # sequence "\," which triggers a warning on modern Python. The set of
    # characters (including the backslash) is unchanged.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]
'''
A_B_C returns:
A. Language with the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages.
'''
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the language mix of the dialogues.

    Args:
        dialogue_language: Sequence with one detected language per dialogue.
        non_dial_src_lang: Language detected for the non-dialogue
            (action-line) text.

    Returns:
        A tuple ``(A, B, C)`` where

        * ``A`` is the most frequent dialogue language (ties go to the
          language seen first),
        * ``B`` is the number of dialogues written in ``non_dial_src_lang``,
          or 0 when that language is ``A`` itself or does not occur,
        * ``C`` is the number of dialogues in any remaining language.

    Raises:
        IndexError: If ``dialogue_language`` is empty.

    The previous implementation computed ``C`` as ``sum(sorted_values[2:])``
    whenever the action-line language occurred among the dialogues. That
    dropped the second-ranked language when ``A`` was the action-line
    language, and double-counted ``B`` (while skipping rank 2) when the
    action-line language ranked third or lower. ``C`` is now derived as
    "everything that is neither A nor B".
    """
    print("line 316:dialogue_language", dialogue_language)
    print(non_dial_src_lang)
    counts = Counter(dialogue_language)
    dict1 = dict(counts)
    print("line 319:dict1", dict1)
    # most_common() sorts by descending count and keeps first-seen order for
    # equal counts, matching the ordering the original nested loops built.
    ranked = counts.most_common()
    sorted_values = [count for _, count in ranked]
    print("line 321:sorted_values:", sorted_values)
    sources = [lang for lang, _ in ranked]
    print("line 328: sources: ", sources)

    A = sources[0]
    # B only counts action-line-language dialogues when that language is not
    # already the dominant one.
    B = 0 if A == non_dial_src_lang else counts[non_dial_src_lang]
    C = sum(sorted_values) - counts[A] - B
    return A, B, C
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Report whether any word of *dial* is detected as ``non_dial_src_lang``.

    Each whitespace-separated word is passed to ``language_detector`` in
    order, stopping at the first match.

    Returns:
        The string "True" if a matching word was found, otherwise the string
        "False" (strings, not booleans).
    """
    found = any(
        language_detector(token) == non_dial_src_lang
        for token in dial.split()
    )
    return "True" if found else "False"
def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Report whether *dial* contains a word in a third language.

    A word counts as "other" when its detected language is neither
    ``non_dial_src_lang`` nor ``A``.

    Args:
        non_dial_src_lang: Language of the non-dialogue (action-line) text.
        A: Dominant dialogue language.
        dial: Dialogue text; split on whitespace into words.

    Returns:
        The string "True" if such a word exists, otherwise the string
        "False" (strings, not booleans).

    The previous condition was ``lang != non_dial_src_lang or lang != A``,
    which is true for every word whenever the two languages differ. It also
    called the detector twice per word; each word is now detected once.
    """
    expected_languages = (non_dial_src_lang, A)
    for word in dial.split():
        if language_detector(word) not in expected_languages:
            return "True"
    return "False"
def word_with_actionline(scenes, A, non_dial_src_lang):
    """Check whether an ``A``-language dialogue contains action-line-language words.

    Args:
        scenes: Iterable of scenes; each scene is a sequence whose first
            element is skipped and whose remaining elements are either
            strings (skipped) or single-key dicts. Entries keyed
            'Transition' are skipped; for any other key the element at
            index 2 of the value is used as the dialogue text
            (presumably ``{speaker: [..., ..., dialogue, ...]}`` - verify
            against ``ScriptBreakdown.getScenes``).
        A: Dominant dialogue language.
        non_dial_src_lang: Language of the non-dialogue (action-line) text.

    Returns:
        "True" as soon as a dialogue detected as ``A`` contains a word
        detected as ``non_dial_src_lang``; "False" otherwise, including when
        ``A`` equals ``non_dial_src_lang``. The values are strings.

    The previous version fell off the end of the function (returning
    ``None``) when nothing was found; it now returns "False" consistently.
    """
    if A == non_dial_src_lang:
        return "False"
    for scene in tqdm(scenes):
        for i, line in enumerate(scene):
            # Index 0 is skipped; plain strings are not dialogue entries.
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            # Only dialogues detected as the dominant language are inspected
            # word by word.
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang1(non_dial_src_lang, dialogue) == "True":
                return "True"
    return "False"
def word_with_other(scenes, A, non_dial_src_lang):
    """Check whether an ``A``-language dialogue contains third-language words.

    Args:
        scenes: Iterable of scenes; each scene is a sequence whose first
            element is skipped and whose remaining elements are either
            strings (skipped) or single-key dicts. Entries keyed
            'Transition' are skipped; for any other key the element at
            index 2 of the value is used as the dialogue text
            (presumably ``{speaker: [..., ..., dialogue, ...]}`` - verify
            against ``ScriptBreakdown.getScenes``).
        A: Dominant dialogue language.
        non_dial_src_lang: Language of the non-dialogue (action-line) text.

    Returns:
        "True" as soon as ``dial_each_word_lang2`` reports "True" for a
        dialogue detected as ``A``; "False" otherwise. The values are
        strings.

    The previous version fell off the end of the function (returning
    ``None``) when nothing was found; it now returns "False" consistently.
    """
    for scene in tqdm(scenes):
        for i, line in enumerate(scene):
            # Index 0 is skipped; plain strings are not dialogue entries.
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            # Only dialogues detected as the dominant language are inspected
            # word by word.
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue) == "True":
                return "True"
    return "False"
def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    The file is broken down with ``ScriptBreakdown`` and only the first ten
    scenes are sampled. The first string element found (after each scene's
    first element, which is skipped) provides the non-dialogue language and
    script. Every non-'Transition' dict entry contributes one dialogue
    language and one dialogue script.

    Args:
        filename1: Path of the script file handed to
            ``ScriptBreakdown().getRefined``.

    Returns:
        An eight-element list::

            [non_dial_src_lang, dial_src_lang, dial_src_script,
             non_dial_src_script, UI_option3, UI_option4, "Yes", "Yes"]

        ``dial_src_lang`` is the dominant dialogue language (``A`` from
        ``A_B_C``), ``dial_src_script`` is the most common dialogue script,
        ``UI_option3`` is "Yes" when ``B > 0`` and ``UI_option4`` is "Yes"
        when ``C > 0``. The last two entries are always "Yes".

    Raises:
        ValueError: If the sampled scenes contain no non-dialogue line or no
            dialogue (previously an UnboundLocalError / IndexError).
        KeyError: If ``default_script`` has no entry for the dominant
            language.

    Notes:
        ``one_step_process`` is computed and printed but is not part of the
        returned list. The statements that used to follow the ``return``
        (per-word checks via ``word_with_actionline`` / ``word_with_other``
        and a second result list) could never execute and were removed.
    """
    # NOTE(review): a fresh ScriptBreakdown is created for every step, as in
    # the original; whether one instance could be reused is not visible here.
    refined, total_scenes = ScriptBreakdown().getRefined(filename1)
    _, without_slug = ScriptBreakdown().getSlugAndNonSlug(refined)
    characters = ScriptBreakdown().getSpeakers(without_slug)
    scenes, _, _, _, _ = ScriptBreakdown().getScenes(
        refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    non_dial_src_lang = None
    non_dial_src_script = None
    action_line_seen = False

    # Sample at most the first ten scenes.
    for scene in tqdm(scenes[:10]):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                # Only the first string line is used to detect the
                # non-dialogue language and script.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_det(line)
                    action_line_seen = True
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            dial_src_lang = language_detector(dialogue)
            print("dial_src_lang:line 430:", dial_src_lang)
            language_of_all_dialogues.append(dial_src_lang)
            script_of_all_dialogues.append(script_det(dialogue))
    print(language_of_all_dialogues)

    if not action_line_seen:
        raise ValueError(
            "no non-dialogue line found in the sampled scenes; "
            "cannot detect the action-line language")
    if not language_of_all_dialogues:
        raise ValueError(
            "no dialogue found in the sampled scenes; "
            "cannot detect the dialogue language")

    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)
    one_step_process = "Yes" if dial_src_script == default_script[A] else "Can_not_say"
    print("one_step_process", one_step_process)

    if B > 0:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"
    if C > 0:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"

    dial_src_lang = A
    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script, UI_option3, UI_option4, "Yes", "Yes"]
# filename1 = sys.argv[1]
# getInputs(filename1)