Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detectionold.py

320 lines
12 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
from google.cloud import translate_v2 as Translate
from google.cloud import translate
from MNF.settings import BasePath
# from .script_writing import default_script
from .translation_variables import code_script
from .script_detector import script_cat
from statistics import mode
from collections import Counter
# import textract
from tqdm import tqdm
import sys
import re
import os
from .script_reading import getRefined, getSlugAndNonSlug, getSpeakers, getScenes
import requests
import uuid
import json
import boto3
from collections import Counter
# Project base path; the credential and data file paths used in this module are built from it.
basePath = BasePath()
# -> Google Translation API Credentials
# The variable is set before the clients below are constructed
# (assumes the Google client libraries read it at construction time -- TODO confirm).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/MNF/json_keys/authentication.json"
# translate_v2 client; language_detector() calls its detect_language().
translate_client = Translate.Client()
# TranslationServiceClient from google.cloud.translate; not referenced in the visible part of this file.
client = translate.TranslationServiceClient()
# -> For Detecting language of any text
def language_detector(text):
    """Detect the language of ``text`` with the Google Translate v2 client.

    Args:
        text: The sentence or word to classify.

    Returns:
        The ``language`` value reported by ``translate_client.detect_language``.
        When that value is longer than three characters, only the part before
        the first ``-`` is returned (e.g. ``"zh-CN"`` becomes ``"zh"``).
    """
    print(text,"sentence recieved")
    # primary language detector
    result = translate_client.detect_language(text)
    language = result['language']
    print("length re:",len(language))
    # NOTE(review): an Azure + AWS Comprehend majority-vote fallback used to
    # follow the returns below. It could never execute (both branches return
    # first, and it was marked "takes too long to use this") and it embedded a
    # subscription key in source, so it was removed. Rotate that key if it was
    # ever valid, and load credentials from configuration if the fallback is
    # ever reinstated.
    if len(language) > 3:
        # Drop the region/script suffix so callers compare bare language codes.
        return str(language).split("-")[0]
    return language
# -> For Detecting Script of any text
def script_det(text):
    """Return the script reported for the first non-punctuation character of ``text``.

    Args:
        text: Text whose script should be identified.

    Returns:
        The first element of ``script_cat(char)``, where ``char`` is the first
        character of ``text`` that is not in the punctuation set below, or the
        empty string when ``text`` is empty or consists only of punctuation.
    """
    # Raw string: the set contains a literal backslash, and "\," inside a
    # normal string literal is an invalid escape sequence that newer Python
    # versions warn about. The character set itself is unchanged.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # NOTE(review): whitespace and digits are not in the set, so a leading
    # space is what gets classified -- confirm that is intended.
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]
'''
A. Language of the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages
'''
# -> For Detecting presence of different languages in dialogues (whole sentences)
def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise which languages the dialogues are written in.

    Args:
        dialogue_language: One detected language code per dialogue.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        A tuple ``(A, B, C)``:
            A: the most frequent dialogue language (ties go to the language
               that appears first in ``dialogue_language``);
            B: the number of dialogues in ``non_dial_src_lang``, or 0 when
               that language is ``A`` itself or does not occur;
            C: the number of dialogues in languages other than ``A`` and the
               language counted in ``B``.

    Raises:
        IndexError: if ``dialogue_language`` is empty.
    """
    print("line 316:dialogue_language", dialogue_language)
    counts = Counter(dialogue_language)
    print("line 319:dict1", dict(counts))
    if not counts:
        raise IndexError("A_B_C() needs at least one dialogue language")
    # most_common() sorts by count, descending, and keeps first-seen order for
    # ties -- the same order the earlier hand-written nested loop produced.
    ranked = counts.most_common()
    sorted_values = [count for _, count in ranked]
    print("line 321:sorted_values:", sorted_values)
    sources = [lang for lang, _ in ranked]
    print("line 328: sources: ", sources)
    A = sources[0]
    print("Most Prominent Dialogue Language", A)
    # When the action-line language is also the dominant dialogue language it
    # is already accounted for by A, so it is not counted again in B.
    B = 0 if A == non_dial_src_lang else counts.get(non_dial_src_lang, 0)
    C = sum(sorted_values[1:]) - B
    return A, B, C
# -> Detection of action-line-language words inside a dialogue
def dial_each_word_lang1(non_dial_src_lang, dial):
    """Tell whether any word of ``dial`` is detected as ``non_dial_src_lang``.

    Args:
        non_dial_src_lang: Language code to look for.
        dial: Dialogue text; it is split on whitespace into words.

    Returns:
        The string ``"True"`` as soon as one word is detected in that
        language, otherwise the string ``"False"`` (strings, not booleans).
    """
    # any() stops at the first match, so no further words are sent to the
    # detector once a hit is found.
    found = any(
        language_detector(token) == non_dial_src_lang for token in dial.split()
    )
    return "True" if found else "False"
# -> Detection of words inside a dialogue that differ from the action-line or main dialogue language
def dial_each_word_lang2(non_dial_src_lang, A, dial):
    """Tell whether ``dial`` contains a word in some third language.

    Args:
        non_dial_src_lang: Language code of the action lines.
        A: Most prominent dialogue language code.
        dial: Dialogue text; it is split on whitespace into words.

    Returns:
        The string ``"True"`` as soon as one word is detected in a language
        that is neither ``non_dial_src_lang`` nor ``A``, otherwise the string
        ``"False"`` (strings, not booleans).
    """
    known_langs = (non_dial_src_lang, A)
    for word in dial.split():
        # The earlier condition was `!= non_dial_src_lang or != A`, which is
        # true for every word whenever the two languages differ. A word is
        # "other" only when it matches neither of them. The detector is also
        # called once per word instead of twice.
        if language_detector(word) not in known_langs:
            return "True"
    return "False"
# -> Detection of words in lines with different languages
def word_with_actionline_other_lang(scenes, A, non_dial_src_lang):
    """Look for foreign-language words inside dialogues written in language ``A``.

    Args:
        scenes: Sequence of scenes. In each scene the first element is
            skipped, ``str`` elements are skipped, and every other element is
            a single-key dict ``{speaker: values}`` whose ``values[2]`` is
            sent to the language detector (presumably the dialogue text --
            confirm against ``getScenes``).
        A: Most prominent dialogue language code.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        A tuple ``(actionline_lang_output, other_lang_output)`` of the strings
        ``"True"``/``"False"``. The first becomes ``"True"`` once more than
        five ``A``-language dialogues contain a word in the action-line
        language (never when ``A`` is the action-line language); the second
        once more than five ``A``-language dialogues are flagged by
        ``dial_each_word_lang2``.
    """
    dials_with_actionline_langs = 0
    dials_with_other_langs = 0
    actionline_lang_output = "False"
    other_lang_output = "False"
    # A real boolean: the earlier string flag ("True"/"False") was always
    # truthy, so `not flag` disabled the action-line check unconditionally.
    ignore_actionline_match = A == non_dial_src_lang
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            if actionline_lang_output == "True" and other_lang_output == "True":
                # Both answers are known; stop before paying for another
                # detection request.
                print("Found Presence of other Langs in Words")
                return actionline_lang_output, other_lang_output
            dial_src_lang = language_detector(line[speaker][2])
            print(
                "Still Searching if Words of other langs are present or not...")
            if dial_src_lang != A:
                continue
            if actionline_lang_output != "True" and not ignore_actionline_match:
                output = dial_each_word_lang1(
                    non_dial_src_lang, line[speaker][2])
                if output == "True":
                    dials_with_actionline_langs += 1
                if dials_with_actionline_langs > 5:
                    actionline_lang_output = "True"
            if other_lang_output != "True":
                output = dial_each_word_lang2(
                    non_dial_src_lang, A, line[speaker][2])
                if output == "True":
                    dials_with_other_langs += 1
                if dials_with_other_langs > 5:
                    other_lang_output = "True"
    return actionline_lang_output, other_lang_output
# -> Detection of Different Languages and Scripts in Script
def _dump_script_text(filename1):
    """Write the extracted text of the script, and its first 200 lines, to disk."""
    # Imported here because the module-level `import textract` is commented
    # out, which left the name undefined when this code ran.
    import textract

    # textract.process is expected to return encoded bytes (the result was
    # already being written in binary mode) -- TODO confirm.
    text = textract.process(filename1, encoding="utf8", errors='ignore')
    filename = rf"{basePath}/conversion/translation/file_lines.txt"
    with open(filename, 'wb') as f:
        f.write(text)
    # Read back with the encoding the text was extracted in, rather than the
    # platform default.
    with open(filename, encoding="utf8") as file:
        li = file.readlines()
    print("line", li)
    total_line = len(li)
    print("total_lines,", total_line)
    txt_file_200 = "".join(li[:200])
    filename = rf"{basePath}/conversion/translation/file_lines_200.txt"
    with open(filename, 'wb') as f:
        # The file is opened in binary mode, so the joined str must be encoded
        # first (writing the str directly raised TypeError).
        f.write(txt_file_200.encode("utf8"))
    print("txt_file_200", txt_file_200)


def getInputs(filename1):
    """Detect the languages and scripts used in a screenplay file.

    Only the first five scenes returned by ``getScenes`` are sampled.

    Args:
        filename1: Path of the script file; passed to ``textract.process``
            and ``getRefined``.

    Returns:
        A list ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]``
        where the four ``UI_option*`` entries are the strings ``"Yes"``/``"No"``.

    Raises:
        ValueError: if the sampled scenes contain no action line (no ``str``
            element after the first element of a scene).
        IndexError: from ``A_B_C`` if the sampled scenes contain no dialogue.
    """
    print("Detecting Languages and Scripts present in Script")
    # NOTE(review): the dump's output is not consumed by the detection below;
    # it is kept (with its bugs fixed) because it predates this change. A
    # leftover debugging `exit()` after it, which terminated the process
    # before any detection ran, has been removed.
    _dump_script_text(filename1)
    refined, total_scenes = getRefined(filename1)
    _sluglines, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, _actionline, _parenthetical_lis, _speakers, _dialogues = getScenes(
        refined, total_scenes, characters)
    print("line 520:scenes: ", scenes)
    language_of_all_dialogues = []
    script_of_all_dialogues = []
    # Sample at most the first five scenes to bound the number of detector calls.
    scenes = scenes[:5]
    action_line_seen = False
    for scene in tqdm(scenes):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                # Only the first action line decides the non-dialogue
                # language and script.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_det(line)
                    action_line_seen = True
                    print("Non Dialogue/Actionline Language:", non_dial_src_lang)
                    print("Non Dialogue/Actionline Script:", non_dial_src_script)
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dial_src_lang = language_detector(line[speaker][2])
            language_of_all_dialogues.append(dial_src_lang)
            script_of_all_dialogues.append(script_det(line[speaker][2]))
    if not action_line_seen:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(
            "no action line found in the sampled scenes; cannot detect the "
            "non-dialogue language")
    # -> For Detecting presence of different languages in dialogues (whole sentences)
    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    totaldials = len(language_of_all_dialogues)
    dial_src_script = mode(script_of_all_dialogues)
    dial_src_lang = A
    # -> For Detecting presence of different languages in dialogues (words)
    word_lang_with_actionline, word_lang_with_other = word_with_actionline_other_lang(
        scenes, A, non_dial_src_lang)
    print("A = {} B = {} C = {}".format(A, B, C))
    print("dial_language", A)
    print("dial_src_script", dial_src_script)
    # More than 15% of the dialogues are in the action-line language.
    if round(B / totaldials, 2) > 0.15:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"
    # More than 20% of the dialogues are in other languages.
    if round(C / totaldials, 2) > 0.20:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"
    if word_lang_with_actionline == "True":
        print("UI option5 - Yes")
        UI_option5 = "Yes"
    else:
        print("UI_option5 - NO")
        UI_option5 = "No"
    print("checking other lang", word_lang_with_other)
    if word_lang_with_other == "True":
        print("UI option6 - Yes")
        UI_option6 = "Yes"
    else:
        print("UI option6 - No")
        UI_option6 = "No"
    print("*******************------------Detection------------***********************")
    print(UI_option3, UI_option4, UI_option5, UI_option6, non_dial_src_script)
    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script, UI_option3, UI_option4, UI_option5,
            UI_option6]