"""Translate subtitle text files (and, optionally, PowerPoint decks).

When run as a script, the text file given on the command line is copied
line-by-line into a Word document, each paragraph is translated with
``all_translator``, and two ``.docx`` files are written: the translated
text and a three-column comparison table (original / translated / a copy
of the translation for manual correction).

``translate_presentation`` is kept for PPTX input but is not invoked by
``main``.
"""

import argparse
import os
import string
import sys
import time

import boto3
import docx
from botocore.exceptions import ClientError
from docx.enum.table import WD_ALIGN_VERTICAL, WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Inches, Pt
from docx2pdf import convert
from pptx import Presentation
from pptx.enum.lang import MSO_LANGUAGE_ID
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
from tqdm import tqdm

from optimisation1 import all_translator
from script_detector import script_cat
from script_writing import default_script
from selection_source import (
    function5,
    function11111,
    function2111,
    function221,
    function311,
    function41,
    selection_source,
    selection_source_transliteration,
    two_sources_two_outputs,
)
from translation_metric import (
    bleu_diff_score,
    critera4_5,
    diff_score,
    gleu_diff_score,
    manual_diff_score,
    meteor_diff_score,
    rouge_diff_score,
)
from translation_resources import aws, azure, google, ibm_watson, lingvanex, yandex

# Maps a target language code to the PowerPoint proofing-language id that is
# stamped on translated runs.
LANGUAGE_CODE_TO_LANGUAGE_ID = {
    'af': MSO_LANGUAGE_ID.AFRIKAANS,
    'am': MSO_LANGUAGE_ID.AMHARIC,
    'ar': MSO_LANGUAGE_ID.ARABIC,
    'bg': MSO_LANGUAGE_ID.BULGARIAN,
    'bn': MSO_LANGUAGE_ID.BENGALI,
    'bs': MSO_LANGUAGE_ID.BOSNIAN,
    'cs': MSO_LANGUAGE_ID.CZECH,
    'da': MSO_LANGUAGE_ID.DANISH,
    'de': MSO_LANGUAGE_ID.GERMAN,
    'el': MSO_LANGUAGE_ID.GREEK,
    'en': MSO_LANGUAGE_ID.ENGLISH_US,
    'es': MSO_LANGUAGE_ID.SPANISH,
    'et': MSO_LANGUAGE_ID.ESTONIAN,
    'fi': MSO_LANGUAGE_ID.FINNISH,
    'fr': MSO_LANGUAGE_ID.FRENCH,
    'fr-CA': MSO_LANGUAGE_ID.FRENCH_CANADIAN,
    'ha': MSO_LANGUAGE_ID.HAUSA,
    'he': MSO_LANGUAGE_ID.HEBREW,
    'hi': MSO_LANGUAGE_ID.HINDI,
    'hr': MSO_LANGUAGE_ID.CROATIAN,
    'hu': MSO_LANGUAGE_ID.HUNGARIAN,
    'id': MSO_LANGUAGE_ID.INDONESIAN,
    'it': MSO_LANGUAGE_ID.ITALIAN,
    'ja': MSO_LANGUAGE_ID.JAPANESE,
    'ka': MSO_LANGUAGE_ID.GEORGIAN,
    'ko': MSO_LANGUAGE_ID.KOREAN,
    'lv': MSO_LANGUAGE_ID.LATVIAN,
    'ms': MSO_LANGUAGE_ID.MALAYSIAN,
    'nl': MSO_LANGUAGE_ID.DUTCH,
    'no': MSO_LANGUAGE_ID.NORWEGIAN_BOKMOL,
    'pl': MSO_LANGUAGE_ID.POLISH,
    'ps': MSO_LANGUAGE_ID.PASHTO,
    'pt': MSO_LANGUAGE_ID.BRAZILIAN_PORTUGUESE,
    'ro': MSO_LANGUAGE_ID.ROMANIAN,
    'ru': MSO_LANGUAGE_ID.RUSSIAN,
    'sk': MSO_LANGUAGE_ID.SLOVAK,
    'sl': MSO_LANGUAGE_ID.SLOVENIAN,
    'so': MSO_LANGUAGE_ID.SOMALI,
    'sq': MSO_LANGUAGE_ID.ALBANIAN,
    'sr': MSO_LANGUAGE_ID.SERBIAN_LATIN,
    'sv': MSO_LANGUAGE_ID.SWEDISH,
    'sw': MSO_LANGUAGE_ID.SWAHILI,
    'ta': MSO_LANGUAGE_ID.TAMIL,
    'th': MSO_LANGUAGE_ID.THAI,
    'tr': MSO_LANGUAGE_ID.TURKISH,
    'uk': MSO_LANGUAGE_ID.UKRAINIAN,
    'ur': MSO_LANGUAGE_ID.URDU,
    'vi': MSO_LANGUAGE_ID.VIETNAMESE,
    'zh': MSO_LANGUAGE_ID.CHINESE_SINGAPORE,
    'zh-TW': MSO_LANGUAGE_ID.CHINESE_HONG_KONG_SAR,
}

TERMINOLOGY_NAME = 'pptx-translator-terminology'

# Amazon Translate client; used here only to upload custom terminology.
translate = boto3.client(service_name='translate', region_name='us-east-2', use_ssl=True)

# Text fragments that are never sent to the translator.
_SKIPPED_TEXTS = ("", " ", ',', ' ,')

# Single punctuation characters; a translation equal to one of these is not
# recorded in the PPT comparison table.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def add_dial_comparison_doc_ppt(doc, table, sentence, output):
    """Append an (original, translated) row to the two-column *table*.

    *doc* is accepted for interface compatibility and is not used.
    """
    row_cells = table.add_row().cells
    row_cells[0].text = sentence
    row_cells[1].text = output


def add_dial_comparison_doc_srt(doc, table, sentence, output):
    """Append a row to the three-column subtitle comparison *table*.

    The third column is pre-filled with the same translation as the second.
    *doc* is accepted for interface compatibility and is not used.
    """
    row_cells = table.add_row().cells
    row_cells[0].text = sentence
    row_cells[1].text = output
    row_cells[2].text = output


def _report_client_error(client_error):
    """Print a notice for a ``ClientError`` instead of aborting the run."""
    error_code = client_error.response['Error']['Code']
    if error_code == 'ValidationException':
        # Text not valid. Maybe the size of the text exceeds the size limit of the service.
        # Amazon Translate limits: https://docs.aws.amazon.com/translate/latest/dg/what-is-limits.html
        # We just ignore and don't translate the text.
        print('Invalid text. Ignoring...')
    else:
        # Previously swallowed without a trace; still best-effort, but visible.
        print('Translation failed ({code}). Ignoring...'.format(code=error_code))


def _set_runs_language(text_frame, language_id):
    """Stamp *language_id* on every run of *text_frame*, if an id is known."""
    if language_id is None:
        return
    for paragraph in text_frame.paragraphs:
        for run in paragraph.runs:
            run.font.language_id = language_id


def translate_presentation(presentation, source_language_code, target_language_code,
                           terminology_names, doc, table):
    """Translate the text of *presentation* in place and log each pair.

    For every slide this translates, in order: text frames of shapes that are
    direct children of group shapes, the speaker notes, and every run of every
    top-level shape that has a text frame. Each (original, translated) pair is
    appended to *table* unless the translation is a single punctuation
    character.

    Args:
        presentation: python-pptx presentation object to modify.
        source_language_code: language code passed to ``all_translator``.
        target_language_code: language code passed to ``all_translator``; also
            used to look up the proofing language for translated runs. Codes
            missing from ``LANGUAGE_CODE_TO_LANGUAGE_ID`` are translated but
            not tagged.
        terminology_names: unused; kept for interface compatibility.
        doc: comparison document, forwarded to ``add_dial_comparison_doc_ppt``.
        table: comparison table receiving one row per translated text.
    """
    language_id = LANGUAGE_CODE_TO_LANGUAGE_ID.get(target_language_code)
    number_of_slides = len(presentation.slides)

    for slide_number, slide in enumerate(presentation.slides, start=1):
        print('Slide {slide_number} of {number_of_slides}'.format(
            slide_number=slide_number, number_of_slides=number_of_slides))

        # Text boxes nested one level inside group shapes.
        group_shapes = [shp for shp in slide.shapes
                        if shp.shape_type == MSO_SHAPE_TYPE.GROUP]
        for group_shape in group_shapes:
            for shape in group_shape.shapes:
                if not shape.has_text_frame or shape.text in _SKIPPED_TEXTS:
                    continue
                sentence = shape.text
                output = all_translator(sentence, source_language_code, target_language_code)
                shape.text_frame.text = output
                # An empty translation leaves no runs, so tag whatever exists
                # rather than indexing paragraphs[0].runs[0].
                _set_runs_language(shape.text_frame, language_id)
                if output not in _PUNCTUATION_CHARS:
                    add_dial_comparison_doc_ppt(doc, table, sentence, output)

        # Speaker notes. A skippable note must not skip the rest of the slide,
        # so this is a plain conditional rather than a `continue`.
        notes_frame = slide.notes_slide.notes_text_frame if slide.has_notes_slide else None
        if notes_frame is not None and notes_frame.text not in _SKIPPED_TEXTS:
            try:
                sentence = notes_frame.text
                output = all_translator(sentence, source_language_code, target_language_code)
                notes_frame.text = output
                if output not in _PUNCTUATION_CHARS:
                    add_dial_comparison_doc_ppt(doc, table, sentence, output)
            except ClientError as client_error:
                _report_client_error(client_error)

        # Runs of every top-level shape that carries text.
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for paragraph_run in paragraph.runs:
                    if paragraph_run.text in _SKIPPED_TEXTS:
                        continue
                    try:
                        sentence = paragraph_run.text
                        output = all_translator(sentence, source_language_code,
                                                target_language_code)
                        paragraph_run.text = output
                        if output not in _PUNCTUATION_CHARS:
                            add_dial_comparison_doc_ppt(doc, table, sentence, output)
                        if language_id is not None:
                            paragraph_run.font.language_id = language_id
                    except ClientError as client_error:
                        _report_client_error(client_error)


def import_terminology(terminology_file_path):
    """Upload the CSV at *terminology_file_path* as ``TERMINOLOGY_NAME``.

    Uses merge strategy ``OVERWRITE`` on the module-level ``translate`` client.
    """
    print('Importing terminology data from {file_path}...'.format(file_path=terminology_file_path))
    with open(terminology_file_path, 'rb') as f:
        translate.import_terminology(Name=TERMINOLOGY_NAME,
                                     MergeStrategy='OVERWRITE',
                                     TerminologyData={'File': bytearray(f.read()), 'Format': 'CSV'})


def _looks_like_timing_line(text):
    """Return True when *text* starts with two digits and has ':' at index 3.

    The length guard prevents an IndexError on short numeric lines such as
    cue numbers ("1", "12").
    NOTE(review): 'HH:MM:SS' timestamps have their first colon at index 2 and
    are therefore NOT matched here -- confirm the intended input format.
    """
    return len(text) > 3 and text[:2].isdigit() and text[3] == ":"


def translated_srt(filename, source_lang, target_lang, doc_srt, table_srt):
    """Translate every paragraph of a subtitle document in place and save it.

    Args:
        filename: despite the name, an already-loaded python-docx document
            whose paragraphs are the subtitle lines.
        source_lang: language code passed to ``all_translator``.
        target_lang: language code passed to ``all_translator``; also embedded
            in the output file names.
        doc_srt: comparison document that owns *table_srt*; saved at the end.
        table_srt: three-column table receiving one row per translated line.

    Paragraphs matched by ``_looks_like_timing_line`` are left untouched and
    paragraphs starting with "NOTE" are blanked. Writes
    ``translated_srt <target_lang>.docx`` and ``srt_table<target_lang>.docx``
    to the current directory.
    """
    doc = filename
    for para in tqdm(doc.paragraphs):
        sentence = para.text
        if _looks_like_timing_line(sentence):
            continue
        if sentence.startswith("NOTE"):
            para.text = ""
            continue
        print("inout", sentence)
        output = all_translator(sentence, source_lang, target_lang)
        para.text = output
        add_dial_comparison_doc_srt(doc_srt, table_srt, sentence, output)

    doc.save("translated_srt " + target_lang + ".docx")
    doc_srt.save("srt_table" + target_lang + ".docx")


def _new_srt_comparison_doc(title):
    """Create the comparison document and return ``(document, table)``.

    The document gets 0.2" margins on every section, swapped page width and
    height on its last section, *title* as heading, an empty paragraph and a
    three-column 'Table Grid' table with a bold header row.
    """
    doc_srt = docx.Document()
    for section in doc_srt.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)

    # Swap width and height of the last section.
    last_section = doc_srt.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height, last_section.page_width)

    doc_srt.add_heading(title, 0)
    doc_srt.add_paragraph()

    table_srt = doc_srt.add_table(rows=1, cols=3)
    table_srt.style = 'Table Grid'
    header_cells = table_srt.rows[0].cells
    header_cells[0].paragraphs[0].add_run("Original Sentence").bold = True
    header_cells[1].paragraphs[0].add_run("Translated Sentence").bold = True
    header_cells[2].paragraphs[0].add_run("LPP Corrected Sentence").bold = True
    return doc_srt, table_srt


def main():
    """Command-line entry point: translate the subtitle file named in argv."""
    # The text was previously passed positionally, which argparse treats as
    # `prog`; it is a description.
    argument_parser = argparse.ArgumentParser(
        description='Translates pptx files from source language to target language using Amazon Translate service')
    argument_parser.add_argument(
        'source_language_code', type=str,
        help='The language code for the language of the source text. Example: en')
    argument_parser.add_argument(
        'target_language_code', type=str,
        help='The language code requested for the language of the target text. Example: pt')
    argument_parser.add_argument(
        'input_srt_file', type=str,
        help='The path of the srt file that should be translated')
    argument_parser.add_argument(
        '--terminology', type=str,
        help='The path of the terminology CSV file')
    args = argument_parser.parse_args()

    if args.terminology:
        import_terminology(args.terminology)

    # Copy the subtitle file into a Word document, one paragraph per line.
    # NOTE(review): the file is read with the platform default encoding, as
    # before -- confirm whether the inputs are UTF-8.
    converted = docx.Document()
    with open(args.input_srt_file, "r") as srt_file:
        for line in srt_file:
            converted.add_paragraph(line.rstrip(), style='No Spacing')
    # save() is synchronous, so the file can be reopened immediately.
    converted.save("converted_srt.docx")
    srt_document = docx.Document("converted_srt.docx")

    doc_srt, table_srt = _new_srt_comparison_doc(args.input_srt_file)
    translated_srt(srt_document, args.source_language_code,
                   args.target_language_code, doc_srt, table_srt)


if __name__ == '__main__':
    main()