# Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/translationold.py
# (repository page header: 331 lines, 14 KiB, Python; timestamp shown 2024-04-27 09:33:09 +00:00)

import docx
import sys
from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
from script_detector import script_cat
from script_writing import default_script
from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
from tqdm import tqdm
import os
import string
from optimisation1 import all_translator
import argparse
import boto3
from botocore.exceptions import ClientError
from pptx import Presentation
from pptx.enum.lang import MSO_LANGUAGE_ID
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx2pdf import convert
# Lookup from a language code (as given on the command line) to the
# python-pptx MSO_LANGUAGE_ID member for that language.
# In the code shown in this file the table is referenced only by a
# commented-out line in translate_presentation that would tag each translated
# run's font with the target language.
# NOTE(review): 'zh' maps to CHINESE_SINGAPORE and 'zh-TW' to
# CHINESE_HONG_KONG_SAR - presumably stand-ins for simplified/traditional
# Chinese; confirm before re-enabling the language tagging.
LANGUAGE_CODE_TO_LANGUAGE_ID = {
'af': MSO_LANGUAGE_ID.AFRIKAANS,
'am': MSO_LANGUAGE_ID.AMHARIC,
'ar': MSO_LANGUAGE_ID.ARABIC,
'bg': MSO_LANGUAGE_ID.BULGARIAN,
'bn': MSO_LANGUAGE_ID.BENGALI,
'bs': MSO_LANGUAGE_ID.BOSNIAN,
'cs': MSO_LANGUAGE_ID.CZECH,
'da': MSO_LANGUAGE_ID.DANISH,
'de': MSO_LANGUAGE_ID.GERMAN,
'el': MSO_LANGUAGE_ID.GREEK,
'en': MSO_LANGUAGE_ID.ENGLISH_US,
'es': MSO_LANGUAGE_ID.SPANISH,
'et': MSO_LANGUAGE_ID.ESTONIAN,
'fi': MSO_LANGUAGE_ID.FINNISH,
'fr': MSO_LANGUAGE_ID.FRENCH,
'fr-CA': MSO_LANGUAGE_ID.FRENCH_CANADIAN,
'ha': MSO_LANGUAGE_ID.HAUSA,
'he': MSO_LANGUAGE_ID.HEBREW,
'hi': MSO_LANGUAGE_ID.HINDI,
'hr': MSO_LANGUAGE_ID.CROATIAN,
'hu': MSO_LANGUAGE_ID.HUNGARIAN,
'id': MSO_LANGUAGE_ID.INDONESIAN,
'it': MSO_LANGUAGE_ID.ITALIAN,
'ja': MSO_LANGUAGE_ID.JAPANESE,
'ka': MSO_LANGUAGE_ID.GEORGIAN,
'ko': MSO_LANGUAGE_ID.KOREAN,
'lv': MSO_LANGUAGE_ID.LATVIAN,
'ms': MSO_LANGUAGE_ID.MALAYSIAN,
'nl': MSO_LANGUAGE_ID.DUTCH,
'no': MSO_LANGUAGE_ID.NORWEGIAN_BOKMOL,
'pl': MSO_LANGUAGE_ID.POLISH,
'ps': MSO_LANGUAGE_ID.PASHTO,
'pt': MSO_LANGUAGE_ID.BRAZILIAN_PORTUGUESE,
'ro': MSO_LANGUAGE_ID.ROMANIAN,
'ru': MSO_LANGUAGE_ID.RUSSIAN,
'sk': MSO_LANGUAGE_ID.SLOVAK,
'sl': MSO_LANGUAGE_ID.SLOVENIAN,
'so': MSO_LANGUAGE_ID.SOMALI,
'sq': MSO_LANGUAGE_ID.ALBANIAN,
'sr': MSO_LANGUAGE_ID.SERBIAN_LATIN,
'sv': MSO_LANGUAGE_ID.SWEDISH,
'sw': MSO_LANGUAGE_ID.SWAHILI,
'ta': MSO_LANGUAGE_ID.TAMIL,
'th': MSO_LANGUAGE_ID.THAI,
'tr': MSO_LANGUAGE_ID.TURKISH,
'uk': MSO_LANGUAGE_ID.UKRAINIAN,
'ur': MSO_LANGUAGE_ID.URDU,
'vi': MSO_LANGUAGE_ID.VIETNAMESE,
'zh': MSO_LANGUAGE_ID.CHINESE_SINGAPORE ,
'zh-TW': MSO_LANGUAGE_ID.CHINESE_HONG_KONG_SAR,
}
# Name under which import_terminology uploads the terminology CSV; main also
# places it in the terminology_names list it hands to translate_presentation.
TERMINOLOGY_NAME = 'pptx-translator-terminology'
# boto3 client for the 'translate' service, created when the module is
# imported and used by import_terminology.
# NOTE(review): creating it at import time presumably requires an AWS region
# and credentials to be configured even when --terminology is not used - confirm.
translate = boto3.client(service_name='translate')
def add_dial_comparison_doc_ppt(doc, table, sentence, output):
    """Append a row to ``table`` holding the original and translated text.

    Args:
        doc: Document that owns the table; accepted but not used here.
        table: Table object exposing ``add_row()``; the new row's first two
            cells are filled.
        sentence: Original text, written to the first cell.
        output: Translated text, written to the second cell.
    """
    new_cells = table.add_row().cells
    for column, value in enumerate((sentence, output)):
        new_cells[column].text = value
def add_dial_comparison_doc_srt(doc, table, sentence, output):
    """Append a three-cell row to ``table`` for one translated subtitle line.

    The translated text is written to both the second and the third cell.

    Args:
        doc: Document that owns the table; accepted but not used here.
        table: Table object exposing ``add_row()``; the new row's first three
            cells are filled.
        sentence: Original text, written to the first cell.
        output: Translated text, written to the second and third cells.
    """
    new_cells = table.add_row().cells
    row_values = (sentence, output, output)
    for column, value in enumerate(row_values):
        new_cells[column].text = value
# Texts that are left untouched instead of being sent to the translator.
_UNTRANSLATED_TEXTS = ("", " ", ',', ' ,')

# Single punctuation characters; a translation consisting of just one of these
# is applied to the slide but not recorded in the comparison table.
_PUNCTUATION_MARKS = tuple(string.punctuation)


def _translate_in_place(text_holder, source_language_code, target_language_code, doc, table):
    """Translate ``text_holder.text`` in place and record the pair in ``table``.

    ``text_holder`` is any object with a readable and writable ``text``
    attribute (a notes text frame or a paragraph run). Nothing happens when
    the text is one of ``_UNTRANSLATED_TEXTS``. A ``ClientError`` raised by the
    translator is reported on stdout and the original text is kept.
    """
    sentence = text_holder.text
    if sentence in _UNTRANSLATED_TEXTS:
        return
    try:
        output = all_translator(sentence, source_language_code, target_language_code)
    except ClientError as client_error:
        error_code = client_error.response['Error']['Code']
        if error_code == 'ValidationException':
            # Text not valid. Maybe the size of the text exceeds the size limit of the service.
            # Amazon Translate limits: https://docs.aws.amazon.com/translate/latest/dg/what-is-limits.html
            # We just ignore and don't translate the text.
            print('Invalid text. Ignoring...')
        else:
            # Stay best-effort, but do not hide the failure: other error codes
            # used to be dropped without any trace.
            print('Translation failed ({code}). Ignoring...'.format(code=error_code))
        return
    text_holder.text = output
    if output not in _PUNCTUATION_MARKS:
        add_dial_comparison_doc_ppt(doc, table, sentence, output)


def translate_presentation(presentation, source_language_code, target_language_code, terminology_names, doc, table):
    """Translate the notes and text runs of every slide in ``presentation``.

    Each slide's notes text (when present and non-empty) and every run of
    every paragraph in shapes that have a text frame is passed through
    ``all_translator`` and replaced by the result. Each original/translated
    pair is appended to ``table`` via ``add_dial_comparison_doc_ppt``.

    Args:
        presentation: Presentation object whose slides are modified in place.
        source_language_code: Language code of the existing text.
        target_language_code: Language code to translate into.
        terminology_names: Accepted for interface compatibility; not used by
            the current translation path.
        doc: Comparison document, forwarded to ``add_dial_comparison_doc_ppt``.
        table: Comparison table that receives one row per translated text.
    """
    number_of_slides = len(presentation.slides)
    for slide_number, slide in enumerate(presentation.slides, start=1):
        print('Slide {slide_number} of {number_of_slides}'.format(
            slide_number=slide_number,
            number_of_slides=number_of_slides))

        # Translate the speaker notes. Skipping untranslatable notes happens
        # inside the helper, so it can no longer skip the slide's shapes too.
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            # notes_text_frame is None when the notes slide has no notes placeholder.
            if notes_frame is not None and notes_frame.text:
                _translate_in_place(notes_frame, source_language_code,
                                    target_language_code, doc, table)

        # Translate the text of each shape, run by run.
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for paragraph_run in paragraph.runs:
                    _translate_in_place(paragraph_run, source_language_code,
                                        target_language_code, doc, table)
def import_terminology(terminology_file_path):
    """Upload a terminology CSV file through the module-level ``translate`` client.

    The file is read in binary mode and sent under the name
    ``TERMINOLOGY_NAME`` with the ``OVERWRITE`` merge strategy.

    Args:
        terminology_file_path: Path of the CSV file to upload.
    """
    print('Importing terminology data from {file_path}...'.format(file_path=terminology_file_path))
    with open(terminology_file_path, 'rb') as terminology_file:
        terminology_data = {
            'File': bytearray(terminology_file.read()),
            'Format': 'CSV',
        }
        translate.import_terminology(
            Name=TERMINOLOGY_NAME,
            MergeStrategy='OVERWRITE',
            TerminologyData=terminology_data)
# Deletion table for punct_remover_w_o_digits. The backslash is written as an
# explicit escape ("\\"): the former "\," was an invalid escape sequence that
# newer Python versions warn about, although it produced the same characters.
_PUNCT_REMOVAL_TABLE = str.maketrans('', '', '''!()-[]{};:'"\\,<>./?@#$%^&*_~…।''')


def punct_remover_w_o_digits(string):
    """Return ``string`` with every listed punctuation character removed.

    Letters (with their case), digits, whitespace and characters outside the
    punctuation list (for example ``+`` and ``=``) are kept unchanged.

    Args:
        string: Text to clean.

    Returns:
        The text without the listed punctuation characters.
    """
    # One pass over the text instead of one str.replace per punctuation hit.
    return string.translate(_PUNCT_REMOVAL_TABLE)
def translated_srt(filename, source_lang, target_lang, doc_srt, table_srt):
    """Translate the paragraphs of a .docx file and save the results.

    Every paragraph is stripped of punctuation for inspection only. Paragraphs
    that are then blank, or whose first five characters are all digits, are
    left untouched (presumably subtitle cue numbers and timestamps - confirm
    against the input format). All other paragraphs are replaced by their
    translation and logged in ``table_srt``.

    Two files are written to the current directory: ``<language>.docx`` with
    the translated document and ``srt_table<target_lang>.docx`` with the
    comparison table.

    Args:
        filename: Path of the .docx file to translate.
        source_lang: Language code of the existing text.
        target_lang: Language code to translate into.
        doc_srt: Comparison document that is saved at the end.
        table_srt: Table inside ``doc_srt`` that receives one row per
            translated paragraph.
    """
    # Output file stems for the known target languages. Any other code falls
    # back to the code itself; previously an unlisted code raised
    # UnboundLocalError only after the whole file had been translated.
    language_names = {
        'ja': "japanese",
        'es': "spanish",
        'fr': "french",
        'hi': "hindi",
        'en': "english",
        'mr': "marathi",
        'ar': "arabic",
        'ta': "tamil",
        'te': "telugu",
    }

    doc = docx.Document(filename)
    for para in tqdm(doc.paragraphs):
        text = punct_remover_w_o_digits(para.text)
        # Nothing to translate: blank after punctuation removal, or a line
        # that starts with digits only.
        if not text.strip() or text[:5].isdigit():
            continue
        sentence = para.text
        output = all_translator(sentence, source_lang, target_lang)
        para.text = output
        add_dial_comparison_doc_srt(doc_srt, table_srt, sentence, output)

    language = language_names.get(target_lang, target_lang)
    doc.save(language + ".docx")
    doc_srt.save("srt_table" + target_lang + ".docx")
def _new_comparison_doc(title, column_titles):
    """Create a comparison document with a heading and a one-row header table.

    All sections get 0.2 inch margins, the last section's page width and
    height are swapped, and the table gets one bold header cell per entry of
    ``column_titles``.

    Returns:
        A ``(document, table)`` tuple.
    """
    doc = docx.Document()
    for section in doc.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)
    # Swap width and height so the page is wider than it is tall.
    last_section = doc.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height, last_section.page_width)
    doc.add_heading(title, 0)
    doc.add_paragraph()
    table = doc.add_table(rows=1, cols=len(column_titles))
    table.style = 'TableGrid'
    for cell, column_title in zip(table.rows[0].cells, column_titles):
        cell.paragraphs[0].add_run(column_title).bold = True
    return doc, table


def main():
    """Command-line entry point: translate a .pptx file and a subtitle .docx file.

    Positional arguments are the source language code, the target language
    code, the presentation path and the subtitle document path; the optional
    ``--terminology`` CSV is uploaded before translating.

    Files written to the current directory: ``<language>.pptx``,
    ``srt_table<target>.docx`` (passed to ``convert``), plus whatever
    ``translated_srt`` saves.
    """
    # The text is passed as ``description``; as the first positional argument
    # it was taken as the program name and garbled the usage line.
    argument_parser = argparse.ArgumentParser(
        description='Translates pptx files from source language to target language using Amazon Translate service')
    argument_parser.add_argument(
        'source_language_code', type=str,
        help='The language code for the language of the source text. Example: en')
    argument_parser.add_argument(
        'target_language_code', type=str,
        help='The language code requested for the language of the target text. Example: pt')
    argument_parser.add_argument(
        'input_file_path', type=str,
        help='The path of the pptx file that should be translated')
    argument_parser.add_argument(
        'input_srt_file', type=str,
        help='The path of the srt file that should be translated')
    argument_parser.add_argument(
        '--terminology', type=str,
        help='The path of the terminology CSV file')
    args = argument_parser.parse_args()

    terminology_names = []
    if args.terminology:
        import_terminology(args.terminology)
        terminology_names = [TERMINOLOGY_NAME]

    print('Translating {file_path} from {source_language_code} to {target_language_code}...'.format(
        file_path=args.input_file_path,
        source_language_code=args.source_language_code,
        target_language_code=args.target_language_code))

    # Presentation: translate in place and log each pair in a two-column table.
    presentation = Presentation(args.input_file_path)
    doc_ppt, table_ppt = _new_comparison_doc(
        args.input_file_path,
        ("Original Sentence", "Translated Sentence"))
    translate_presentation(presentation,
                           args.source_language_code,
                           args.target_language_code,
                           terminology_names, doc_ppt, table_ppt)

    # Output file stems for the known target languages. Any other code falls
    # back to the code itself; previously an unlisted code raised
    # UnboundLocalError after the whole presentation had been translated.
    language_names = {
        'ja': "japanese",
        'es': "spanish",
        'fr': "french",
        'hi': "hindi",
        'en': "english",
        'mr': "marathi",
        'ar': "arabic",
        'ta': "tamil",
        'te': "telugu",
    }
    language = language_names.get(args.target_language_code, args.target_language_code)
    output_file_path = language + ".pptx"
    print(output_file_path)
    presentation.save(output_file_path)

    comparison_path = "srt_table" + args.target_language_code + ".docx"
    doc_ppt.save(comparison_path)
    convert(comparison_path)

    # Subtitles: three-column table with an extra column for corrections.
    # NOTE(review): translated_srt saves its table under the same
    # "srt_table<target>.docx" name, replacing the .docx written just above
    # after it has been passed to convert() - confirm this is intended.
    doc_srt, table_srt = _new_comparison_doc(
        args.input_srt_file,
        ("Original Sentence", "Translated Sentence", "LPP Corrected Sentence"))
    translated_srt(args.input_srt_file, args.source_language_code,
                   args.target_language_code, doc_srt, table_srt)
if __name__== '__main__':
main()