Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection5april.py

533 lines
19 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
from google.cloud import translate_v2 as Translate
from google.cloud import translate
from MNF.settings import BasePath
from narration.vectorcode.code.functions import ScriptBreakdown
from .translation.script_writing import default_script
from .translation.script_detector import script_cat
from statistics import mode
from collections import Counter
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt
# import textract
from tqdm import tqdm
import sys
import re
import docx
import os
# Module-level python-docx document. It is created once here; the 'Normal'
# style of this document is configured further down.
doc = docx.Document()
basePath = BasePath()
# google
# Point the Google client libraries at the service-account key file that
# lives under the project's conversion directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/My First Project-2573112d5326.json"
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
# v2 client: used by language_detector() in this file.
translate_client = Translate.Client()
# Translation service client plus the resource path built from the project id
# and location below.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"
# Regexes for slugline-like line starts: optional digits, then an
# INT/EXT-style token (slug_pattern) or an I/E-style token (pat).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
transitions = ['CUT TO:', 'FADE IN:', 'FADE OUT:', 'DISSOLVE TO:', 'JUMP CUT TO:',
               'JUMP TO:', 'CUT BACK TO:', 'INTERCUT WITH:', 'I/C WITH:', 'BACK TO:', 'INTERVAL']
# NOTE(review): 'FADE' is listed twice; kept as-is so the exported list value
# does not change for any external importer.
reserved_words = ['MONTAGE', 'PBS', 'FADE',
                  'FADE', 'TITLE', 'SPLIT', 'SCREEN', 'CUT']
# Default body font of the module-level document: 12pt Courier New.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)
def breaksen(s, max_words=256, chunk_words=32):
    """Split a long text into chunks of a bounded number of words.

    Args:
        s: Text to split. Words are whitespace-separated.
        max_words: Texts with at most this many words are returned unchanged
            as a single-element list.
        chunk_words: Maximum number of words per chunk when the text is
            split. Must be positive (``range`` rejects a zero step).

    Returns:
        ``[s]`` (the original string, untouched) when the text is short
        enough; otherwise a list of chunks, each holding up to
        ``chunk_words`` words joined by single spaces.
    """
    words = s.split()  # split once, not once per chunk
    if len(words) <= max_words:
        return [s]
    # Stepping through the word list by chunk size never yields an empty
    # trailing chunk, even when the word count is an exact multiple of the
    # chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
# def getRefined(filename1):
# print("get_refined_called")
# total_scenes = 0
# text = textract.process(filename1, encoding="utf8", errors='ignore')
# filename = rf"{basePath}/conversion/translation/file.txt"
# f = open(filename, 'wb')
# f.write(text)
# f.close()
# dialog_coming = False
# f = open(filename, 'r', encoding="utf8", errors='ignore')
# doc11 = f.read()
# f.close()
# f1 = open(rf"{basePath}/conversion/translation/file1.txt",
# 'w', encoding="utf8", errors='ignore')
# c = 0
# flag = False
# print("Slugline")
# for line in doc11.split("\n"):
# line = line.strip()
# print("line 427:", line)
# if (line.strip().startswith(('INT.', 'INT ')) or
# line.strip().startswith(('I/E', 'E/I')) or
# line.strip().startswith(('EXT.', 'EXT ')) or
# line.strip().startswith('EXT/INT') or
# line.strip().startswith('INT/EXT') or
# re.match(slug_pattern, line.strip()) or re.match(pat, line.strip())) and not (line.strip().startswith(('INTERCUT', 'INTERMISSION', 'INTERVAL'))):
# flag = True
# f1.write(line)
# f1.write('\n')
# continue
# else:
# print("line 96: else loop", line)
# #line = line.strip()
# if flag:
# print("line 99: if loop:", line)
# if line.strip() == '\n':
# continue
# if dialog_coming and (line == '\n' or line.strip() == ""):
# print("line empty or just have newline", line)
# continue
# if dialog_coming:
# print("line 101 probable dialog or PC: ", line)
# f1.write(line)
# f1.write('\n')
# if re.match(r"\(.*\)", line):
# continue
# else:
# print(" line 207: else of PCs", line)
# dialog_coming = False
# print(" line 457 dialog over")
# continue
# continue
# # if line.isupper() and re.fullmatch(r"([A-Z']*[.]*[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*",line.strip()):
# if line.isupper() and (re.fullmatch(r"([A-Z']+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", line.strip()) or re.fullmatch(r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+", line.strip())):
# print("line 111: May be speaker: ", line)
# f1.write(line)
# f1.write('\n')
# dialog_coming = True
# continue
# if not line == '\n':
# print(
# "470 probably action or something else so just write it", line)
# f1.write(line)
# f1.write('\n')
# f1.close()
# print("line 132 file closed")
# filename1 = rf"{basePath}/conversion/translation/file1.txt"
# # file.txt contains the data of file1.txt; it has no use as of now. Maybe change the name of file.txt to file1.txt.
# text = textract.process(filename1, encoding="utf8", errors='ignore')
# print("line 136: ", text)
# filename = rf"{basePath}/conversion/translation/file.txt"
# _, file_extension = os.path.splitext(filename1)
# f = open(filename, 'wb')
# f.write(text)
# f.close()
# with open(filename, "r") as input:
# input_ = input.read().split('\n\n')
# refined = []
# for line in input_:
# refined.append(line.strip())
# refined = list(filter(lambda a: a != "", refined))
# #print("processing the script")
# for i in range(len(refined)):
# if not (refined[i].strip().startswith(('INT.', 'INT ')) or refined[i].strip().startswith(('EXT.', 'EXT ')) or refined[i].strip().startswith('EXT/INT') or refined[i].strip().startswith('INT/EXT') or refined[i].strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, refined[i].strip()) or re.match(pat, refined[i].strip())):
# total_scenes = total_scenes + 1
# continue
# refined = refined[i:]
# break
# # refined.append(line.strip())
# refined = list(filter(lambda a: a != "", refined))
# print("line 163:Refined", refined)
# return refined, total_scenes
# def getSlugAndNonSlug(refined):
# sluglines = []
# without_slug = []
# for para in refined:
# para = para.strip()
# if para.strip().startswith(('INT.', 'INT')) or para.strip().startswith(('EXT.', 'EXT')) or para.strip().startswith('EXT/INT') or para.strip().startswith(('I/E', 'E/I')) or para.strip().startswith('INT/EXT') or re.match(slug_pattern, para.strip()) or re.match(pat, para.strip()):
# sluglines.append(para)
# continue
# without_slug.append(para)
# return sluglines, without_slug
# def getSpeakers(without_slug):
# characters = []
# for para in without_slug:
# lis = para.split('\n')
# i = 0
# for item in lis:
# i = i+1
# i = min(i, len(lis)-2)
# if item.isupper() and not(lis[i+1].strip() == ""):
# if re.match(r"[A-Z']+[\s]*[-]*[A-Z']*([#]*[\s]*[1-9])*(\(.*\))*", item):
# tem = item.split("(")[0].strip()
# characters.append(tem.strip())
# else:
# continue
# characters = list(set(characters))
# characters = list(filter(lambda x: len(x) > 0, characters))
# characters = [character for character in characters if set(
# character.split(" ")).intersection(reserved_words) == set()]
# return characters
# def getScenes(refined, total_scenes, characters):
# # To find scenes data structure and prev and next scenes numbers
# i = 0
# scene = []
# dialogues = []
# speakers = []
# slugline_dic = {}
# prev_dial_speaker = ""
# next_dial_speaker = ""
# pc = 0
# scene_no = 0
# actionline = []
# successor_scene_no = 0
# predecessor_scene_no = 0
# parenthetical_lis = []
# scenes = []
# speaker = ""
# parenthetical = 'NONE'
# patttern = r'[\d]*[.]?[\s]*[IE][NX]T'
# for line in refined:
# if line.strip().startswith(('INT.', 'INT')) or line.strip().startswith(('EXT.', 'EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, line.strip()) or re.match(pat, line.strip()):
# scenes.append(scene)
# scene = []
# i = 0
# scene_no += 1
# scene.append(line)
# slugline_dic[scene_no] = line.split("\n")[0].strip('0123456789.- ')
# else:
# lis = line.split("\n")
# lis = [l.strip() for l in lis]
# print(" \n Line 222 probable dialogue list", lis)
# word = lis[0]
# if word.split('(')[0].strip() in characters:
# mydic = {}
# prev_dial_speaker = speaker
# speaker = word.split('(')[0].strip()
# print("Speaker 228", speaker)
# if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
# pc = pc+1
# parenthetical = lis[1]
# parenthetical = parenthetical.replace("\n", "")
# dia = ' '.join(lis[2:])
# dia = dia.replace("\n", "")
# # renu
# dia = dia.replace("\"", '')
# else:
# dia = ''.join(lis[1:])
# dia = dia.replace("\n", "")
# dia = dia.replace("\"", '')
# print(" length dia\n", len(dia))
# if not (len(dia) == 0 and parenthetical == "NONE"):
# print(" len dia != and Parenthetical == NONE: 384 ")
# if i-1 >= 0:
# try:
# prev = main_lis[scene_no-1][i-1]
# except:
# prev = ""
# else:
# prev = ""
# try:
# next = main_lis[scene_no-1][i+1]
# except:
# next = ""
# # prev is previous speaker and next is next speaker of the dialogue
# mydic[speaker] = [parenthetical,
# scene_no, dia, len(dia), prev, next]
# print("line 259", mydic)
# #print("mydic 260", speaker, mydic[speaker])
# prev, next = "", ""
# i = i+1
# speakers.append(speaker)
# parenthetical_lis.append(parenthetical)
# dialogues.append(mydic)
# scene.append(mydic)
# parenthetical = "NONE"
# else:
# line = line.replace("\n", " ")
# line = ' '.join(line.split())
# if line.strip() in transitions:
# scene.append({'Transition': line.strip()})
# continue
# actionline.append(line)
# scene.append(line.strip())
# scenes.append(scene)
# speakers = list(set(speakers))
# scenes = scenes[1:]
# s = []
# for scene in scenes:
# s1 = []
# for ele in scene:
# if type(ele) == type(""):
# s1.extend(ele.split("\n"))
# else:
# s1.append(ele)
# s.append(s1)
# print("dialogue: ", dialogues)
# return s, actionline, parenthetical_lis, speakers, dialogues
def language_detector(text):
    """Return the source language detected for *text*.

    The text is passed to ``translate_client.translate`` with Hindi ('hi') as
    the target language; only the ``detectedSourceLanguage`` entry of the
    response is used, the translation itself is discarded.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]
def script_det(text):
    """Return the script reported for the first non-punctuation character.

    The first character of *text* that is not in the punctuation set below is
    handed to ``script_cat`` and element ``[0]`` of its result is returned
    (presumably the script name -- confirm against ``script_cat``).

    If *text* is empty or consists only of punctuation, an empty string is
    passed to ``script_cat``.
    """
    # Raw string: the set contains a literal backslash, and "\," in a normal
    # string literal is an invalid escape sequence. The resulting characters
    # are the same as before.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]
'''
A. Language with the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages.
'''
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the language mix of a list of per-dialogue languages.

    Args:
        dialogue_language: Sequence with one detected language per dialogue.
        non_dial_src_lang: Language detected for the action lines.

    Returns:
        A tuple ``(A, B, C)`` where

        * ``A`` is the most frequent dialogue language (ties go to the
          language that occurs first in *dialogue_language*),
        * ``B`` is the number of dialogues in the action-line language, or 0
          when that language is ``A`` itself or does not occur,
        * ``C`` is the number of dialogues in any remaining language.

    Raises:
        IndexError: if *dialogue_language* is empty.
    """
    print("line 316:dialogue_language", dialogue_language)
    print(non_dial_src_lang)
    counts = Counter(dialogue_language)
    print("line 319:dict1", dict(counts))
    # most_common() sorts by count, highest first, and keeps first-seen order
    # among equal counts.
    ranked = counts.most_common()
    sorted_values = [count for _, count in ranked]
    print("line 321:sorted_values:", sorted_values)
    sources = [lang for lang, _ in ranked]
    print("line 328: sources: ", sources)
    A = sources[0]
    # Counter returns 0 for a language that never occurred.
    B = 0 if A == non_dial_src_lang else counts[non_dial_src_lang]
    # Everything that is neither the main dialogue language nor counted in B.
    # Deriving C from the total keeps it correct regardless of where the
    # action-line language sits in the ranking.
    C = sum(sorted_values) - counts[A] - B
    return A, B, C
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Tell whether any word of *dial* is detected as the action-line language.

    Each whitespace-separated word is sent to ``language_detector`` until one
    matches *non_dial_src_lang*.

    Returns:
        The string ``"True"`` on the first match, otherwise ``"False"``
        (strings, not booleans).
    """
    found = any(
        language_detector(token) == non_dial_src_lang
        for token in dial.split()
    )
    return "True" if found else "False"
def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Tell whether any word of *dial* is in a third language.

    A word counts when its detected language is neither the action-line
    language (*non_dial_src_lang*) nor the main dialogue language (*A*).

    Returns:
        The string ``"True"`` on the first such word, otherwise ``"False"``
        (strings, not booleans).
    """
    known_languages = (non_dial_src_lang, A)
    for word in dial.split():
        # A word must differ from BOTH known languages to count; joining the
        # two inequality tests with ``or`` would accept nearly every word.
        # Detect once per word so each word costs a single API call.
        if language_detector(word) not in known_languages:
            return "True"
    return "False"
def word_with_actionline(scenes, A, non_dial_src_lang):
    """Check for dialogues in language *A* that contain action-line-language words.

    Args:
        scenes: Iterable of scenes. Within each scene the first element is
            skipped, string elements are skipped, and every other element is
            unpacked as a single-key mapping ``{speaker: values}`` where
            ``values[2]`` is used as the dialogue text (presumably the layout
            produced by ``ScriptBreakdown.getScenes`` -- confirm).
        A: Main dialogue language.
        non_dial_src_lang: Language detected for the action lines.

    Returns:
        ``"True"`` as soon as a dialogue detected as *A* contains a word
        detected as *non_dial_src_lang*; ``"False"`` otherwise, including
        when both languages are the same (strings, not booleans).
    """
    if A == non_dial_src_lang:
        return "False"
    for scene in tqdm(scenes):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang1(non_dial_src_lang, dialogue) == "True":
                return "True"
    # Explicit negative result so every path returns the same kind of value
    # instead of falling through to None.
    return "False"
def word_with_other(scenes, A, non_dial_src_lang):
    """Check for dialogues in language *A* that contain third-language words.

    Args:
        scenes: Iterable of scenes. Within each scene the first element is
            skipped, string elements are skipped, and every other element is
            unpacked as a single-key mapping ``{speaker: values}`` where
            ``values[2]`` is used as the dialogue text (presumably the layout
            produced by ``ScriptBreakdown.getScenes`` -- confirm).
        A: Main dialogue language.
        non_dial_src_lang: Language detected for the action lines.

    Returns:
        ``"True"`` as soon as ``dial_each_word_lang2`` reports ``"True"`` for
        a dialogue detected as *A*; ``"False"`` otherwise (strings, not
        booleans).
    """
    for scene in tqdm(scenes):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue) == "True":
                return "True"
    # Explicit negative result so the function never falls through to None.
    return "False"
def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    The file is broken down with ``ScriptBreakdown``; then at most the first
    ten scenes are sampled. The first string element found (after each
    scene's first element) supplies the action-line language and script, and
    every dialogue element contributes one detected language and script.

    Args:
        filename1: Path of the script file, passed to
            ``ScriptBreakdown().getRefined``.

    Returns:
        An 8-element list::

            [non_dial_src_lang, dial_src_lang, dial_src_script,
             non_dial_src_script, UI_option3, UI_option4, "Yes", "Yes"]

        ``UI_option3`` / ``UI_option4`` are ``"Yes"`` when ``B`` / ``C`` from
        ``A_B_C`` are positive, else ``"No"``. The last two entries are
        always ``"Yes"``: the word-level checks (``word_with_actionline`` /
        ``word_with_other``) are not run by this function.

    Raises:
        ValueError: if no action line is found in the sampled scenes.
        IndexError: from ``A_B_C`` if no dialogue is found in the sampled
            scenes.
        KeyError: if the main dialogue language is missing from
            ``default_script``.
    """
    # NOTE(review): a fresh ScriptBreakdown is created for every step, as in
    # the existing flow; whether one shared instance would be equivalent is
    # not visible from this file.
    refined, total_scenes = ScriptBreakdown().getRefined(filename1)
    sluglines, without_slug = ScriptBreakdown().getSlugAndNonSlug(refined)
    characters = ScriptBreakdown().getSpeakers(without_slug)
    scenes, actionline, parenthetical_lis, speakers, dialogues = ScriptBreakdown().getScenes(
        refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    non_dial_src_lang = None
    non_dial_src_script = None
    action_line_seen = False

    # Only the first ten scenes are sampled to bound the number of API calls.
    for scene in tqdm(scenes[:10]):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                # Only the first action line decides the action-line
                # language and script.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_det(line)
                    action_line_seen = True
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            dial_src_lang = language_detector(dialogue)
            print("dial_src_lang:line 430:", dial_src_lang)
            language_of_all_dialogues.append(dial_src_lang)
            script_of_all_dialogues.append(script_det(dialogue))
    print(language_of_all_dialogues)

    if not action_line_seen:
        # Fail with a clear message rather than an unbound-variable error
        # further down.
        raise ValueError(
            "no action line found in the sampled scenes; "
            "cannot detect the action-line language"
        )

    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)
    # Computed and printed for diagnostics only; it is not part of the result.
    one_step_process = "Yes" if dial_src_script == default_script[A] else "Can_not_say"
    print("one_step_process", one_step_process)

    if B > 0:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"
    if C > 0:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"

    dial_src_lang = A
    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script, UI_option3, UI_option4, "Yes", "Yes"]
# filename1 = sys.argv[1]
# getInputs(filename1)