362 lines
15 KiB
Python
362 lines
15 KiB
Python
|
import time
|
||
|
import docx
|
||
|
import sys
|
||
|
from .translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex
|
||
|
from .script_detector import script_cat
|
||
|
from .script_writing import default_script
|
||
|
from .translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5
|
||
|
from .selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs
|
||
|
from tqdm import tqdm
|
||
|
import os
|
||
|
import string
|
||
|
from .optimisation1 import all_translator
|
||
|
import argparse
|
||
|
import boto3
|
||
|
from botocore.exceptions import ClientError
|
||
|
from pptx import Presentation
|
||
|
from pptx.enum.lang import MSO_LANGUAGE_ID
|
||
|
from docx.shared import Inches, Cm, Pt
|
||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
|
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
|
||
|
from docx2pdf import convert
|
||
|
|
||
|
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
|
||
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
||
|
import docx2txt
|
||
|
|
||
|
# Lookup from language code to the python-pptx ``MSO_LANGUAGE_ID`` member
# that translate_presentation() assigns to ``font.language_id`` on
# translated runs.
LANGUAGE_CODE_TO_LANGUAGE_ID = {
    'af': MSO_LANGUAGE_ID.AFRIKAANS,
    'am': MSO_LANGUAGE_ID.AMHARIC,
    'ar': MSO_LANGUAGE_ID.ARABIC,
    'bg': MSO_LANGUAGE_ID.BULGARIAN,
    'bn': MSO_LANGUAGE_ID.BENGALI,
    'bs': MSO_LANGUAGE_ID.BOSNIAN,
    'cs': MSO_LANGUAGE_ID.CZECH,
    'da': MSO_LANGUAGE_ID.DANISH,
    'de': MSO_LANGUAGE_ID.GERMAN,
    'el': MSO_LANGUAGE_ID.GREEK,
    'en': MSO_LANGUAGE_ID.ENGLISH_US,
    'es': MSO_LANGUAGE_ID.SPANISH,
    'et': MSO_LANGUAGE_ID.ESTONIAN,
    'fi': MSO_LANGUAGE_ID.FINNISH,
    'fr': MSO_LANGUAGE_ID.FRENCH,
    'fr-CA': MSO_LANGUAGE_ID.FRENCH_CANADIAN,
    'ha': MSO_LANGUAGE_ID.HAUSA,
    'he': MSO_LANGUAGE_ID.HEBREW,
    'hi': MSO_LANGUAGE_ID.HINDI,
    'hr': MSO_LANGUAGE_ID.CROATIAN,
    'hu': MSO_LANGUAGE_ID.HUNGARIAN,
    'id': MSO_LANGUAGE_ID.INDONESIAN,
    'it': MSO_LANGUAGE_ID.ITALIAN,
    'ja': MSO_LANGUAGE_ID.JAPANESE,
    'ka': MSO_LANGUAGE_ID.GEORGIAN,
    'ko': MSO_LANGUAGE_ID.KOREAN,
    'lv': MSO_LANGUAGE_ID.LATVIAN,
    'ms': MSO_LANGUAGE_ID.MALAYSIAN,
    'nl': MSO_LANGUAGE_ID.DUTCH,
    'no': MSO_LANGUAGE_ID.NORWEGIAN_BOKMOL,
    'pl': MSO_LANGUAGE_ID.POLISH,
    'ps': MSO_LANGUAGE_ID.PASHTO,
    'pt': MSO_LANGUAGE_ID.BRAZILIAN_PORTUGUESE,
    'ro': MSO_LANGUAGE_ID.ROMANIAN,
    'ru': MSO_LANGUAGE_ID.RUSSIAN,
    'sk': MSO_LANGUAGE_ID.SLOVAK,
    'sl': MSO_LANGUAGE_ID.SLOVENIAN,
    'so': MSO_LANGUAGE_ID.SOMALI,
    'sq': MSO_LANGUAGE_ID.ALBANIAN,
    'sr': MSO_LANGUAGE_ID.SERBIAN_LATIN,
    'sv': MSO_LANGUAGE_ID.SWEDISH,
    'sw': MSO_LANGUAGE_ID.SWAHILI,
    'ta': MSO_LANGUAGE_ID.TAMIL,
    'th': MSO_LANGUAGE_ID.THAI,
    'tr': MSO_LANGUAGE_ID.TURKISH,
    'uk': MSO_LANGUAGE_ID.UKRAINIAN,
    'ur': MSO_LANGUAGE_ID.URDU,
    'vi': MSO_LANGUAGE_ID.VIETNAMESE,
    'zh': MSO_LANGUAGE_ID.CHINESE_SINGAPORE,
    'zh-TW': MSO_LANGUAGE_ID.CHINESE_HONG_KONG_SAR,
}

# Name passed to ``translate.import_terminology`` by import_terminology().
TERMINOLOGY_NAME = 'pptx-translator-terminology'

# boto3 "translate" client shared by the module; it is created at import
# time, so importing this module already calls into boto3.
translate = boto3.client(
    service_name='translate',
    region_name='us-east-2',
    use_ssl=True,
)
|
||
|
|
||
|
|
||
|
def add_dial_comparison_doc_ppt(doc, table, sentence, output):
    """Append a two-column row (original text, translated text) to ``table``.

    Args:
        doc: Accepted for call-site symmetry; not used by this function.
        table: Object exposing ``add_row()`` whose result has a ``cells``
            sequence with at least two entries (each with a ``text``
            attribute).
        sentence: Original text, written to the first cell.
        output: Translated text, written to the second cell.
    """
    new_cells = table.add_row().cells
    for column, value in enumerate((sentence, output)):
        new_cells[column].text = value
|
||
|
|
||
|
|
||
|
def add_dial_comparison_doc_srt(doc, table, sentence, output):
    """Append a three-column row to ``table`` for one subtitle line.

    The first cell receives the original sentence; the second and third
    cells both receive the translated output.

    Args:
        doc: Accepted for call-site symmetry; not used by this function.
        table: Object exposing ``add_row()`` whose result has a ``cells``
            sequence with at least three entries (each with a ``text``
            attribute).
        sentence: Original text.
        output: Translated text.
    """
    new_cells = table.add_row().cells
    for column, value in enumerate((sentence, output, output)):
        new_cells[column].text = value
|
||
|
|
||
|
|
||
|
# Texts that carry nothing translatable and are left untouched.
_PPT_SKIP_TEXTS = ("", " ", ',', ' ,')

# Single punctuation characters; a translation equal to one of these is not
# worth a row in the comparison table.
_PPT_SINGLE_PUNCTUATION = tuple(string.punctuation)


def _translate_ppt_text(text, source_language_code, target_language_code, doc, table):
    """Translate ``text`` and record the pair in the comparison table.

    Returns the translated string, or ``None`` when the translation backend
    raised a ``ClientError`` (the caller then leaves the text untouched).
    """
    try:
        output = all_translator(text, source_language_code, target_language_code)
    except ClientError as client_error:
        error_code = client_error.response['Error']['Code']
        if error_code == 'ValidationException':
            # Text not valid. Maybe the size of the text exceeds the size limit of the service.
            # Amazon Translate limits: https://docs.aws.amazon.com/translate/latest/dg/what-is-limits.html
            # We just ignore and don't translate the text.
            print('Invalid text. Ignoring...')
        else:
            # Still best-effort, but no longer silent.
            print('Translation failed ({error_code}). Ignoring...'.format(
                error_code=error_code))
        return None

    if output is not None and output not in _PPT_SINGLE_PUNCTUATION:
        add_dial_comparison_doc_ppt(doc, table, text, output)
    return output


def translate_presentation(presentation, source_language_code, target_language_code, terminology_names, doc, table):
    """Translate the text of every slide of ``presentation`` in place.

    For each slide, in this order:

    1. text frames of shapes nested directly inside group shapes (the whole
       frame text is replaced by its translation),
    2. the speaker notes,
    3. every run of every paragraph of the slide's top-level text shapes.

    Each (original, translation) pair is also appended to ``table`` through
    ``add_dial_comparison_doc_ppt`` unless the translation is a single
    punctuation character.

    Args:
        presentation: python-pptx presentation; modified in place.
        source_language_code: Language code of the existing text.
        target_language_code: Language code to translate into. When it has
            an entry in ``LANGUAGE_CODE_TO_LANGUAGE_ID`` the translated runs
            are tagged with that proofing language; otherwise the tag is
            simply not set.
        terminology_names: Unused; kept so existing callers keep working.
        doc: Forwarded to ``add_dial_comparison_doc_ppt``.
        table: Comparison table receiving one row per translated text.
    """
    # Resolve once; None means "no proofing language known for this code"
    # instead of a KeyError after the text was already translated.
    language_id = LANGUAGE_CODE_TO_LANGUAGE_ID.get(target_language_code)
    number_of_slides = len(presentation.slides)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        print('Slide {slide_number} of {number_of_slides}'.format(
            slide_number=slide_number,
            number_of_slides=number_of_slides))

        # 1. Text boxes nested inside group shapes.
        for group_shape in slide.shapes:
            if group_shape.shape_type != MSO_SHAPE_TYPE.GROUP:
                continue
            for shape in group_shape.shapes:
                if not shape.has_text_frame:
                    continue
                sentence = shape.text
                if sentence in _PPT_SKIP_TEXTS:
                    continue
                output = _translate_ppt_text(
                    sentence, source_language_code, target_language_code, doc, table)
                if output is None:
                    continue
                shape.text_frame.text = output
                # An empty translation leaves the first paragraph without
                # runs, so only tag a run that actually exists.
                first_runs = shape.text_frame.paragraphs[0].runs
                if first_runs and language_id is not None:
                    first_runs[0].font.language_id = language_id

        # 2. Speaker notes. Guarded with plain conditions (no ``continue``)
        # so that blank notes never skip the slide's shapes below.
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None and notes_frame.text not in _PPT_SKIP_TEXTS:
                output = _translate_ppt_text(
                    notes_frame.text, source_language_code, target_language_code, doc, table)
                if output is not None:
                    notes_frame.text = output

        # 3. Top-level shapes, translated run by run to keep run formatting.
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for paragraph_run in paragraph.runs:
                    if paragraph_run.text in _PPT_SKIP_TEXTS:
                        continue
                    output = _translate_ppt_text(
                        paragraph_run.text, source_language_code, target_language_code, doc, table)
                    if output is None:
                        continue
                    paragraph_run.text = output
                    if language_id is not None:
                        paragraph_run.font.language_id = language_id
|
||
|
|
||
|
|
||
|
def import_terminology(terminology_file_path):
    """Upload a CSV terminology file through the module's ``translate`` client.

    The file is read as bytes and sent with ``MergeStrategy='OVERWRITE'``
    under the name ``TERMINOLOGY_NAME``.

    Args:
        terminology_file_path: Path of the CSV terminology file to upload.
    """
    print(f'Importing terminology data from {terminology_file_path}...')

    with open(terminology_file_path, 'rb') as terminology_file:
        csv_payload = bytearray(terminology_file.read())

    translate.import_terminology(
        Name=TERMINOLOGY_NAME,
        MergeStrategy='OVERWRITE',
        TerminologyData={'File': csv_payload, 'Format': 'CSV'},
    )
|
||
|
|
||
|
|
||
|
# Characters removed by punct_remover_w_o_digits(). The backslash is written
# as ``\\`` so the literal no longer relies on the invalid escape ``\,``.
_PUNCTUATION_TO_STRIP = '''!()-[]{};:'"\\,<>./?@#$%^&*_~…।'''

# Deletion table built once so each call is a single pass over the input.
_PUNCTUATION_STRIP_TABLE = str.maketrans('', '', _PUNCTUATION_TO_STRIP)


def punct_remover_w_o_digits(string):
    """Return ``string`` with the module's punctuation characters removed.

    Digits, letters, whitespace and any character not listed in
    ``_PUNCTUATION_TO_STRIP`` (for example ``+`` or ``=``) are kept as they
    are; the case of the text is not changed.

    Args:
        string: Text to clean.

    Returns:
        The text without the listed punctuation characters.
    """
    return string.translate(_PUNCTUATION_STRIP_TABLE)
|
||
|
|
||
|
# def translated_srt(filename, source_lang, target_lang, doc_srt, table_srt):
|
||
|
|
||
|
|
||
|
def translated_srt(filename, source_lang, target_lang, loc):
    """Translate the subtitle text held in a docx document and export it as .srt.

    Every paragraph is inspected after stripping punctuation: blank or
    comma-only paragraphs are skipped, and so are paragraphs whose first
    five remaining characters are all digits (which is how cue numbers and
    timestamp lines are left untranslated). Every other paragraph is
    replaced by its translation.

    The document is then saved as ``{loc}/translated_srt_{target_lang}.docx``,
    its text is extracted with ``docx2txt`` and written to
    ``{loc}/translated_srt_{target_lang}.srt``.

    Args:
        filename: Despite the name, an already-opened python-docx document
            (it must expose ``paragraphs`` and ``save``); modified in place.
        source_lang: Language code of the subtitles.
        target_lang: Language code to translate into.
        loc: Directory in which both output files are written.

    Returns:
        The path of the generated ``.srt`` file as a string.
    """
    skip_texts = ("", " ", ',', ' ,')
    doc = filename

    for para in tqdm(doc.paragraphs):
        text = punct_remover_w_o_digits(para.text)
        if text in skip_texts:
            continue
        # Cue numbers / timestamps: leave as they are.
        if text[:5].isdigit():
            continue
        para.text = all_translator(para.text, source_lang, target_lang)

    docx_path = loc + "/translated_srt_" + target_lang + ".docx"
    trans_sub_path = rf"{loc}/translated_srt_{target_lang}.srt"

    doc.save(docx_path)
    text = docx2txt.process(docx_path)

    # Explicit UTF-8: translated subtitles routinely contain characters the
    # platform default encoding (e.g. cp1252) cannot represent.
    with open(trans_sub_path, "w", encoding="utf-8") as text_file:
        print(text, file=text_file)

    print(trans_sub_path)
    return str(trans_sub_path)
|
||
|
# doc_srt.save("srt_table"+target_lang+".docx")
|
||
|
# convert("srt_table"+target_lang+".docx")
|
||
|
|
||
|
|
||
|
def translate_sub(input_srt_file, source_language_code, target_language_code, file_path):
    """Translate an .srt subtitle file and return the path of the translated .srt.

    The subtitle file is copied line by line (trailing whitespace removed)
    into a docx document saved as ``{file_path}/converted_srt.docx``; that
    document is re-opened and handed to ``translated_srt``, which performs
    the translation and writes the output files into ``file_path``.

    Args:
        input_srt_file: Path of the .srt file to translate.
        source_language_code: Language code of the subtitles.
        target_language_code: Language code to translate into.
        file_path: Directory used for the intermediate and output files.

    Returns:
        Whatever ``translated_srt`` returns (the translated .srt path).
    """
    path = file_path

    # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM, which
    # would otherwise make the first cue number look like translatable text.
    with open(input_srt_file, "r", encoding="utf-8-sig") as srt_file:
        lines = [line.rstrip() for line in srt_file]

    doc = docx.Document()
    for line in lines:
        doc.add_paragraph(line, style='No Spacing')

    converted_path = rf"{path}/converted_srt.docx"
    doc.save(converted_path)

    # The saved document is re-read directly, without a fixed delay.
    file = docx.Document(converted_path)

    return translated_srt(file, source_language_code,
                          target_language_code, path)
|
||
|
|
||
|
|
||
|
# if __name__ == '__main__':
|
||
|
# main()
|