331 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
		
		
			
		
	
	
			331 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
|  | import docx | ||
|  | import sys | ||
|  | from translation_resources import ibm_watson, google, aws, azure, lingvanex, yandex  | ||
|  | from script_detector import script_cat | ||
|  | from script_writing import default_script | ||
|  | from translation_metric import manual_diff_score, bleu_diff_score, gleu_diff_score, meteor_diff_score, rouge_diff_score, diff_score, critera4_5 | ||
|  | from selection_source import selection_source, function5, function41, function311, function221, function2111, function11111, selection_source_transliteration, two_sources_two_outputs | ||
|  | from tqdm import tqdm | ||
|  | import os | ||
|  | import string | ||
|  | from optimisation1 import all_translator | ||
|  | import argparse | ||
|  | import boto3 | ||
|  | from botocore.exceptions import ClientError | ||
|  | from pptx import Presentation | ||
|  | from pptx.enum.lang import MSO_LANGUAGE_ID | ||
|  | from docx.shared import Inches, Cm, Pt | ||
|  | from docx.enum.text import WD_ALIGN_PARAGRAPH | ||
|  | from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL | ||
|  | from docx2pdf import convert | ||
|  | 
 | ||
# Lookup from a translation language code to the python-pptx
# ``MSO_LANGUAGE_ID`` member used to tag text with that language.
# No active code in this section reads the table.
# NOTE(review): 'zh' and 'zh-TW' map to the Singapore and Hong Kong SAR
# regional identifiers -- confirm these are the intended variants.
LANGUAGE_CODE_TO_LANGUAGE_ID = {
    "af": MSO_LANGUAGE_ID.AFRIKAANS,
    "am": MSO_LANGUAGE_ID.AMHARIC,
    "ar": MSO_LANGUAGE_ID.ARABIC,
    "bg": MSO_LANGUAGE_ID.BULGARIAN,
    "bn": MSO_LANGUAGE_ID.BENGALI,
    "bs": MSO_LANGUAGE_ID.BOSNIAN,
    "cs": MSO_LANGUAGE_ID.CZECH,
    "da": MSO_LANGUAGE_ID.DANISH,
    "de": MSO_LANGUAGE_ID.GERMAN,
    "el": MSO_LANGUAGE_ID.GREEK,
    "en": MSO_LANGUAGE_ID.ENGLISH_US,
    "es": MSO_LANGUAGE_ID.SPANISH,
    "et": MSO_LANGUAGE_ID.ESTONIAN,
    "fi": MSO_LANGUAGE_ID.FINNISH,
    "fr": MSO_LANGUAGE_ID.FRENCH,
    "fr-CA": MSO_LANGUAGE_ID.FRENCH_CANADIAN,
    "ha": MSO_LANGUAGE_ID.HAUSA,
    "he": MSO_LANGUAGE_ID.HEBREW,
    "hi": MSO_LANGUAGE_ID.HINDI,
    "hr": MSO_LANGUAGE_ID.CROATIAN,
    "hu": MSO_LANGUAGE_ID.HUNGARIAN,
    "id": MSO_LANGUAGE_ID.INDONESIAN,
    "it": MSO_LANGUAGE_ID.ITALIAN,
    "ja": MSO_LANGUAGE_ID.JAPANESE,
    "ka": MSO_LANGUAGE_ID.GEORGIAN,
    "ko": MSO_LANGUAGE_ID.KOREAN,
    "lv": MSO_LANGUAGE_ID.LATVIAN,
    "ms": MSO_LANGUAGE_ID.MALAYSIAN,
    "nl": MSO_LANGUAGE_ID.DUTCH,
    "no": MSO_LANGUAGE_ID.NORWEGIAN_BOKMOL,
    "pl": MSO_LANGUAGE_ID.POLISH,
    "ps": MSO_LANGUAGE_ID.PASHTO,
    "pt": MSO_LANGUAGE_ID.BRAZILIAN_PORTUGUESE,
    "ro": MSO_LANGUAGE_ID.ROMANIAN,
    "ru": MSO_LANGUAGE_ID.RUSSIAN,
    "sk": MSO_LANGUAGE_ID.SLOVAK,
    "sl": MSO_LANGUAGE_ID.SLOVENIAN,
    "so": MSO_LANGUAGE_ID.SOMALI,
    "sq": MSO_LANGUAGE_ID.ALBANIAN,
    "sr": MSO_LANGUAGE_ID.SERBIAN_LATIN,
    "sv": MSO_LANGUAGE_ID.SWEDISH,
    "sw": MSO_LANGUAGE_ID.SWAHILI,
    "ta": MSO_LANGUAGE_ID.TAMIL,
    "th": MSO_LANGUAGE_ID.THAI,
    "tr": MSO_LANGUAGE_ID.TURKISH,
    "uk": MSO_LANGUAGE_ID.UKRAINIAN,
    "ur": MSO_LANGUAGE_ID.URDU,
    "vi": MSO_LANGUAGE_ID.VIETNAMESE,
    "zh": MSO_LANGUAGE_ID.CHINESE_SINGAPORE,
    "zh-TW": MSO_LANGUAGE_ID.CHINESE_HONG_KONG_SAR,
}
|  | 
 | ||
# Name under which the custom terminology is registered by import_terminology().
TERMINOLOGY_NAME = "pptx-translator-terminology"

# Module-wide boto3 client for the "translate" service; it is created as a
# side effect of importing this module.
translate = boto3.client(service_name="translate")
|  | 
 | ||
def add_dial_comparison_doc_ppt(doc, table, sentence, output):
    """Append an original/translated pair as a new two-column table row.

    Args:
        doc: Document owning ``table``; accepted but not used here.
        table: Table exposing ``add_row()`` whose result has ``cells``.
        sentence: Original text, written to the first cell.
        output: Translated text, written to the second cell.
    """
    new_cells = table.add_row().cells
    for column, value in enumerate((sentence, output)):
        new_cells[column].text = value
|  | 
 | ||
def add_dial_comparison_doc_srt(doc, table, sentence, output):
    """Append an original/translated pair as a new three-column table row.

    The translated text is written to both the second and the third cell.

    Args:
        doc: Document owning ``table``; accepted but not used here.
        table: Table exposing ``add_row()`` whose result has ``cells``.
        sentence: Original text, written to the first cell.
        output: Translated text, written to the second and third cells.
    """
    new_cells = table.add_row().cells
    for column, value in enumerate((sentence, output, output)):
        new_cells[column].text = value
|  | 
 | ||
# Texts that are left untouched because there is nothing to translate.
_SKIPPED_TEXTS = ("", " ", ",", " ,")

# Single punctuation characters; a translation equal to one of these is not
# recorded in the comparison table.
_PUNCTUATION_MARKS = tuple(string.punctuation)


def _translate_text_holder(text_holder, source_language_code, target_language_code, doc, table):
    """Translate ``text_holder.text`` in place and record the pair in ``table``.

    ``text_holder`` is any object with a readable and writable ``text``
    attribute (here: a notes text frame or a paragraph run).
    """
    sentence = text_holder.text
    if sentence in _SKIPPED_TEXTS:
        return
    try:
        output = all_translator(sentence, source_language_code, target_language_code)
        text_holder.text = output
        if output not in _PUNCTUATION_MARKS:
            add_dial_comparison_doc_ppt(doc, table, sentence, output)
    except ClientError as client_error:
        error_code = client_error.response['Error']['Code']
        if error_code == 'ValidationException':
            # Text not valid. Maybe the size of the text exceeds the size limit of the service.
            # Amazon Translate limits: https://docs.aws.amazon.com/translate/latest/dg/what-is-limits.html
            # We just ignore and don't translate the text.
            print('Invalid text. Ignoring...')
        else:
            # Stay best-effort, but do not hide the failure completely.
            print('Translation failed ({error_code}). Ignoring...'.format(error_code=error_code))


def translate_presentation(presentation, source_language_code, target_language_code, terminology_names, doc, table):
    """Translate the notes and every text run of each slide in place.

    Each translated text is also appended, together with its original, as a
    row of ``table`` unless the translation is a single punctuation character.

    Args:
        presentation: python-pptx presentation whose slides are modified.
        source_language_code: Language code of the existing text.
        target_language_code: Language code to translate into.
        terminology_names: Currently unused; kept for interface compatibility.
        doc: Comparison document, forwarded to the row-adding helper.
        table: Comparison table receiving one row per translated text.
    """
    slides = presentation.slides
    number_of_slides = len(slides)
    for slide_number, slide in enumerate(slides, start=1):
        print('Slide {slide_number} of {number_of_slides}'.format(
                slide_number=slide_number,
                number_of_slides=number_of_slides))

        # Translate the speaker notes. Skippable notes must not skip the
        # slide's shapes, so there is no `continue` on this path.
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            # A notes slide may have no notes placeholder at all.
            if notes_frame is not None:
                _translate_text_holder(notes_frame, source_language_code,
                                       target_language_code, doc, table)

        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for paragraph_run in paragraph.runs:
                    _translate_text_holder(paragraph_run, source_language_code,
                                           target_language_code, doc, table)
|  | 
 | ||
def import_terminology(terminology_file_path):
    """Upload a CSV terminology file through the module-level ``translate`` client.

    The file is read as bytes and registered under ``TERMINOLOGY_NAME`` with
    the ``OVERWRITE`` merge strategy.

    Args:
        terminology_file_path: Path of the CSV terminology file to upload.
    """
    print('Importing terminology data from {file_path}...'.format(file_path=terminology_file_path))
    with open(terminology_file_path, 'rb') as terminology_file:
        csv_payload = bytearray(terminology_file.read())
    terminology_data = {'File': csv_payload, 'Format': 'CSV'}
    translate.import_terminology(
        Name=TERMINOLOGY_NAME,
        MergeStrategy='OVERWRITE',
        TerminologyData=terminology_data,
    )
|  | 
 | ||
# Deletion table for the punctuation characters stripped by
# punct_remover_w_o_digits(). The literal is raw so that the backslash is
# kept as a character instead of forming an invalid escape sequence.
_PUNCTUATION_TABLE = str.maketrans("", "", r'''!()-[]{};:'"\,<>./?@#$%^&*_~…।''')


def punct_remover_w_o_digits(string):
    """Return ``string`` with the listed punctuation characters removed.

    Digits, letters, whitespace and any character outside the punctuation
    list (for example ``+`` and ``=``) are kept unchanged.

    Args:
        string: Text to clean.

    Returns:
        A copy of ``string`` without the punctuation characters.
    """
    # One pass over the text instead of one replace() per character found.
    return string.translate(_PUNCTUATION_TABLE)
|  | 
 | ||
# File-name stems used for the translated subtitle document.
_SRT_LANGUAGE_NAMES = {
    'ja': "japanese",
    'es': "spanish",
    'fr': "french",
    'hi': "hindi",
    'en': "english",
    'mr': "marathi",
    'ar': "arabic",
    'ta': "tamil",
    'te': "telugu",
}


def translated_srt(filename, source_lang, target_lang, doc_srt, table_srt):
    """Translate the paragraphs of a subtitle document and save the results.

    ``filename`` is opened with ``docx.Document``. A paragraph is left
    untouched when it is empty after punctuation removal, or when the first
    five characters of the punctuation-free text are all digits (presumably
    subtitle indices and timestamps). Every other paragraph is replaced by
    its translation and logged as a row of ``table_srt``.

    The translated document is saved as ``<language>.docx`` and the
    comparison document as ``srt_table<target_lang>.docx``.

    Args:
        filename: Path of the document holding the subtitle text.
        source_lang: Language code of the existing text.
        target_lang: Language code to translate into.
        doc_srt: Comparison document that is saved at the end.
        table_srt: Three-column table in ``doc_srt`` receiving the rows.
    """
    doc = docx.Document(filename)
    for para in tqdm(doc.paragraphs):
        sentence = para.text
        stripped = punct_remover_w_o_digits(sentence)
        if not stripped or stripped[:5].isdigit():
            continue
        output = all_translator(sentence, source_lang, target_lang)
        para.text = output
        add_dial_comparison_doc_srt(doc_srt, table_srt, sentence, output)

    # Fall back to the raw code so an unlisted language no longer fails with
    # an unbound variable after all the translation work has been done.
    language = _SRT_LANGUAGE_NAMES.get(target_lang, target_lang)

    doc.save(language + ".docx")
    doc_srt.save("srt_table" + target_lang + ".docx")
|  | 
 | ||
# File-name stems used for the translated presentation.
_PPTX_LANGUAGE_NAMES = {
    'ja': "japanese",
    'es': "spanish",
    'fr': "french",
    'hi': "hindi",
    'en': "english",
    'mr': "marathi",
    'ar': "arabic",
    'ta': "tamil",
    'te': "telugu",
}


def _new_comparison_document(title, column_headers):
    """Create a comparison document with a heading and a bold header row.

    Returns the ``(document, table)`` pair; the table has one column per
    entry of ``column_headers``.
    """
    document = docx.Document()
    for section in document.sections:
        section.top_margin = Inches(0.2)
        section.bottom_margin = Inches(0.2)
        section.left_margin = Inches(0.2)
        section.right_margin = Inches(0.2)

    # Swap width and height of the last section.
    last_section = document.sections[-1]
    last_section.page_width, last_section.page_height = (
        last_section.page_height, last_section.page_width)

    document.add_heading(title, 0)
    document.add_paragraph()
    table = document.add_table(rows=1, cols=len(column_headers))
    table.style = 'TableGrid'
    for cell, header in zip(table.rows[0].cells, column_headers):
        cell.paragraphs[0].add_run(header).bold = True
    return document, table


def main():
    """Command-line entry point: translate a pptx file and a subtitle document.

    Positional arguments are the source and target language codes, the pptx
    path and the subtitle document path; ``--terminology`` optionally names a
    CSV terminology file that is uploaded first.

    The translated presentation is saved as ``<language>.pptx`` and its
    comparison table as ``srt_table<target>.docx``, which is then passed to
    ``convert``. The subtitle document is translated afterwards.
    """
    # The text is the description, not the program name (first positional
    # parameter of ArgumentParser is ``prog``).
    argument_parser = argparse.ArgumentParser(
            description='Translates pptx files from source language to target language using Amazon Translate service')
    argument_parser.add_argument(
            'source_language_code', type=str,
            help='The language code for the language of the source text. Example: en')
    argument_parser.add_argument(
            'target_language_code', type=str,
            help='The language code requested for the language of the target text. Example: pt')
    argument_parser.add_argument(
            'input_file_path', type=str,
            help='The path of the pptx file that should be translated')
    argument_parser.add_argument(
            'input_srt_file', type=str,
            help='The path of the srt file that should be translated')
    argument_parser.add_argument(
            '--terminology', type=str,
            help='The path of the terminology CSV file')
    args = argument_parser.parse_args()

    terminology_names = []
    if args.terminology:
        import_terminology(args.terminology)
        terminology_names = [TERMINOLOGY_NAME]

    print('Translating {file_path} from {source_language_code} to {target_language_code}...'.format(
            file_path=args.input_file_path,
            source_language_code=args.source_language_code,
            target_language_code=args.target_language_code))
    presentation = Presentation(args.input_file_path)

    doc_ppt, table_ppt = _new_comparison_document(
            args.input_file_path,
            ("Original Sentence", "Translated Sentence"))

    translate_presentation(presentation,
                           args.source_language_code,
                           args.target_language_code,
                           terminology_names, doc_ppt, table_ppt)

    # Fall back to the raw code so an unlisted language (such as the 'pt'
    # example in the help text) no longer fails with an unbound variable.
    language = _PPTX_LANGUAGE_NAMES.get(args.target_language_code,
                                        args.target_language_code)
    output_file_path = language + ".pptx"
    print(output_file_path)
    presentation.save(output_file_path)

    # NOTE(review): translated_srt() saves its table under this same file
    # name, so this .docx is overwritten later -- confirm that is intended.
    comparison_file_path = "srt_table" + args.target_language_code + ".docx"
    doc_ppt.save(comparison_file_path)
    convert(comparison_file_path)

    doc_srt, table_srt = _new_comparison_document(
            args.input_srt_file,
            ("Original Sentence", "Translated Sentence", "LPP Corrected Sentence"))

    translated_srt(args.input_srt_file, args.source_language_code,
                   args.target_language_code, doc_srt, table_srt)
|  | 
 | ||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()