"""Translate a PDF book into a .docx document.

Pipeline: convert_books() -> pdf_text_images_extractor() (Adobe PDF Services
Extract API) -> zip_extractor() -> json_parser() -> word_creator().
"""

# for extraction of text and images from pdf
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
# for zip extraction
from zipfile import ZipFile
# for parsing json
import json
# for adding tables in docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION
# Delete Intermediate Files
import shutil
# For Translation
from conversion.translation.translation_function import translate_comparison
# For headers and footers
import PyPDF2

# Output directory for the final translated .docx files.
basePath = "/home/user/mnf/project/MNF/media/scripts/book/translated"
# Project root, used to build intermediate-file paths.
basepath = "/home/user/mnf/project/MNF"
# Adobe PDF Services credentials JSON.
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


def extract_header_footer(pdf_path, headerFlag, footerFlag):
    """Collect the first (header) and/or last (footer) text line of each page.

    :param pdf_path: path of the PDF to scan.
    :param headerFlag: when True, collect the first text line of every page.
    :param footerFlag: when True, collect the last text line of every page.
    :return: ``(header_text, footer_text)`` lists with one entry per page for
        each enabled flag (an empty list when the flag is False).
    """
    header_text = []
    footer_text = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfFileReader(file)
        for page in reader.pages:
            # Extract once per page (the original called extractText() twice).
            # BUG FIX: a page with no extractable text used to raise
            # IndexError; fall back to "" so the per-page index alignment
            # with `count` in word_creator() is preserved.
            lines = page.extractText().splitlines()
            if headerFlag:
                header_text.append(lines[0] if lines else "")
            if footerFlag:
                footer_text.append(lines[-1] if lines else "")
    return header_text, footer_text


def set_cell_margins(cell, **kwargs):
    """Set the internal margins of a docx table cell.

    :param cell: python-docx table cell to modify.
    :param kwargs: margin sizes in dxa (twentieths of a point) keyed by
        "top", "start", "bottom" and/or "end"; missing keys are left as-is.
    """
    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()
    tcMar = OxmlElement('w:tcMar')
    for m in ["top", "start", "bottom", "end"]:
        if m in kwargs:
            node = OxmlElement("w:{}".format(m))
            node.set(qn('w:w'), str(kwargs.get(m)))
            node.set(qn('w:type'), 'dxa')
            tcMar.append(node)
    tcPr.append(tcMar)


def add_table_to_doc(doc, df):
    """Append *df* to *doc* as a grid table: header row + one row per record.

    The Excel artifact " _x000D_" (escaped carriage return) is stripped from
    every cell. Returns *doc* for convenience.
    """
    columns = list(df.columns)
    table = doc.add_table(rows=1, cols=len(columns), style="Table Grid")
    table.autofit = True
    for col in range(len(columns)):
        set_cell_margins(table.cell(0, col), top=100, start=100, bottom=100, end=50)
        table.cell(0, col).text = columns[col].replace(" _x000D_", "").capitalize()
    for row in df.itertuples():
        table_row = table.add_row().cells
        for col in range(len(columns)):
            set_cell_margins(table_row[col], top=100, start=100, bottom=100, end=50)
            # row[0] is the DataFrame index from itertuples(), hence col + 1.
            table_row[col].text = str(row[col + 1]).replace(" _x000D_", "")
    return doc


def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe Extract PDF operation on *inputFile*.

    Text, tables (with renditions) and figures are extracted; the result zip
    is stored at ``<dir of outputzip>/<stem>/<stem>.zip``.

    :return: path of the written zip, or None when the service call fails
        (the exception is logged, not re-raised -- callers must check).
    """
    try:
        # Initial setup: create a credentials instance from the JSON file.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()

        # Create an ExecutionContext using credentials and a new operation.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)

        # Save the result under a directory named after the zip's stem.
        print("Saving the Zip")
        out_dir = "/".join(outputzip.split("/")[:-1]) + "/"
        filename = (outputzip.split("/")[-1].split("."))[0]
        if not os.path.exists(out_dir + str(filename)):
            os.makedirs(out_dir + str(filename), mode=0o777, exist_ok=False)
        zip_path = out_dir + str(filename) + "/" + str(filename) + ".zip"
        # BUG FIX: the original used open()/close() without try/finally and
        # leaked the handle when write_to_stream() raised.
        with open(zip_path, "wb") as outputfile:
            result.write_to_stream(outputfile)
        return str(zip_path)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")


def zip_extractor(filename):
    """Extract the zip at *filename* into a sibling "contents" directory."""
    target_dir = "/".join(filename.split("/")[:-1]) + "/"
    with ZipFile(filename, 'r') as zipObj:
        # Extract all the contents of zip file next to the archive.
        zipObj.extractall(target_dir + "contents")


def json_parser(filename):
    """Parse Adobe's structuredData.json into per-page element lists.

    Elements are classified by the character right after the "//Document/"
    prefix of their structure Path (index 11): H/P -> text, L -> list,
    F -> figure, S -> section (tables live under "Sect/Table").

    :return: list of pages; each page is a list of elements shaped as
        ["Text", text, size, font, italic, weight, space_after, align],
        ["List Numbering"|"List Data", text, size, font, italic, weight],
        ["Figure", file_path, placement, x0, y0, x1, y1] or
        ["Table", num_rows, num_cols, file_path].
    """
    with open(filename, encoding="utf8") as f:
        data = json.load(f)

    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])

    all_pages_data = []
    curr_page_contents = []
    current_page = 0
    # NOTE(review): page tracking assumes element pages are consecutive
    # (current_page is incremented by 1 on change) -- confirm with the API.
    for element in data['elements']:
        print("1789")
        print(element)
        kind = element['Path'][11]  # first char after "//Document/"

        # Headings and paragraphs.
        if kind == "H" or kind == "P":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []
            # Filter ill-detected "paragraphs" lacking usable text/font info
            # (the original used a bare `except:` here).
            try:
                current_element = ["Text", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            except (KeyError, TypeError):
                continue
            try:
                current_element.append(element["attributes"]["SpaceAfter"])
            except (KeyError, TypeError):
                current_element.append("")
            try:
                current_element.append(element["attributes"]["TextAlign"])
            except (KeyError, TypeError):
                current_element.append("")
            curr_page_contents.append(current_element)

        # A list (ordered/unordered) between paragraphs.
        elif kind == "L":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []
            differ_creator = (element["Path"]).split("/")
            # ".../Lbl" carries the bullet/number label; the rest is item text.
            label = "List Numbering" if differ_creator[-1] == "Lbl" else "List Data"
            curr_page_contents.append([label, element["Text"], element["TextSize"],
                                       element["Font"]["family_name"],
                                       element["Font"]["italic"], element["Font"]["weight"]])

        # Figures.
        elif kind == "F":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []
            bbox = element["attributes"]["BBox"]
            curr_page_contents.append(["Figure", element["filePaths"][0],
                                       element["attributes"]["Placement"],
                                       bbox[0], bbox[1], bbox[2], bbox[3]])

        # Tables (inside a Sect).
        elif kind == "S":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []
            # BUG FIX: the original compared list(Path)[11:21] -- a list of
            # characters -- against the string "Sect/Table", which is always
            # False, so tables were never emitted. Compare the substring.
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table", element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])

    # Flush the last page.
    all_pages_data.append(curr_page_contents)
    return all_pages_data


def word_creator(all_data, doc_name, media_path_ref, src_lang, tar_lang,
                 header_footer_present, headers, footers):
    """Build and save a translated .docx from parsed per-page elements.

    :param all_data: per-page element lists as produced by json_parser().
    :param doc_name: output path for the saved .docx.
    :param media_path_ref: path whose directory holds the extracted
        renditions ("contents/", table xlsx files, figures).
    :param src_lang: source language code for translation.
    :param tar_lang: target language code for translation.
    :param header_footer_present: "header", "footer", "both" or anything else.
    :param headers: per-page header strings (see extract_header_footer()).
    :param footers: per-page footer strings (see extract_header_footer()).
    """
    media_path_dir = "/".join(media_path_ref.split("/")[:-1]) + "/"
    listo = ""  # pending list style ("Ordered"/"UnOrdered") set by "List Numbering".
    doc = Document()
    # Side-channel log of (source, translation) pairs.
    # BUG FIX: the original never closed this handle; use a context manager.
    with open("/home/user/mnf/project/MNF/translation_data.txt", "w") as file:
        for count, page in enumerate(all_data):
            print("headers are:", headers)
            if header_footer_present == "header" or header_footer_present == "both":
                try:
                    header = doc.sections[count].header
                    header_text = header.paragraphs[0].add_run(str(headers[count]))
                    header_text.font.size = Pt(12)
                    header_text.font.bold = True
                except Exception as e:
                    print("Adding Header has the below error: ", e)

            for ele in page:
                print("Current Element", ele)
                # ---- text / headings ------------------------------------
                if ele[0] == "Text":
                    style = doc.styles['Normal']
                    font = style.font
                    font.name = str(ele[3])
                    font.size = Pt(int(ele[2]))
                    act = doc.add_paragraph(style=style)
                    act_format = act.paragraph_format
                    act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                    alignments = {
                        "Justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
                        "Start": WD_ALIGN_PARAGRAPH.LEFT,
                        "Center": WD_ALIGN_PARAGRAPH.CENTER,
                        "End": WD_ALIGN_PARAGRAPH.RIGHT,
                    }
                    act_format.alignment = alignments.get(ele[7], WD_ALIGN_PARAGRAPH.JUSTIFY)
                    act_format.line_spacing = Pt(12)
                    act_format.left_indent = Inches(0)
                    trans_text, trans = translate_comparison(ele[1], src_lang, tar_lang, True)
                    file.write(str(ele[1]))
                    file.write(str(trans))
                    para = act.add_run(trans_text)
                    if ele[4] == "true":
                        para.italic = True
                    if ele[5] > 400:  # font weight above regular -> bold
                        para.bold = True

                # ---- tables ----------------------------------------------
                elif ele[0] == "Table":
                    # read xlsx rendition of the table
                    hr_df = pd.read_excel(media_path_dir + str(ele[3]))
                    # BUG FIX: the original re-assigned `doc = Document()`
                    # here, silently discarding everything built so far.
                    section = doc.sections[0]
                    section.left_margin = Mm(5)
                    section.right_margin = Mm(5)
                    add_table_to_doc(doc, hr_df.iloc[:5])

                # ---- list label (sets style of the following item) -------
                elif ele[0] == "List Numbering":
                    listo = "Ordered" if ele[1][0].isdigit() else "UnOrdered"

                # ---- list item -------------------------------------------
                elif ele[0] == "List Data":
                    item_style = 'List Number' if listo == "Ordered" else 'List Bullet'
                    listo = ""
                    # BUG FIX: translate_comparison returns a (text, info)
                    # tuple (see the "Text" branch); the original passed the
                    # whole tuple to add_paragraph().
                    item_text, _ = translate_comparison(ele[1], src_lang, tar_lang, True)
                    para = doc.add_paragraph(item_text, style=item_style)
                    # BUG FIX: italic/bold must be set on runs; setting them
                    # on the Paragraph object had no effect in python-docx.
                    if ele[4] == "true":
                        for run in para.runs:
                            run.italic = True
                    if ele[5] > 300:
                        for run in para.runs:
                            run.bold = True

                # ---- figures ---------------------------------------------
                elif ele[0] == "Figure":
                    # BBox is (x0, y0, x1, y1) in points -> width/height.
                    doc.add_picture(media_path_dir + "/contents/" + str(ele[1]),
                                    width=Pt(int(ele[5]) - int(ele[3])),
                                    height=Pt(int(ele[6]) - int(ele[4])))
                    doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

            print("footers are:", footers)
            if header_footer_present == "footer" or header_footer_present == "both":
                try:
                    footer = doc.sections[count].footer
                    # BUG FIX: the original indexed `footer[count]` (the
                    # section footer object) instead of the `footers` list,
                    # so this always raised and footers were never written.
                    footer_text = footer.paragraphs[0].add_run(str(footers[count]))
                    footer_text.font.size = Pt(12)
                    footer_text.font.bold = True
                except Exception as e:
                    print("Adding Footer has the below error: ", e)

            # Page/section break so each source page gets its own section
            # (required for the per-page headers/footers above).
            doc.add_page_break()
            doc.add_section(WD_SECTION.NEW_PAGE)

    # save to file
    print("Saving the Doc")
    print(doc_name)
    print(doc.sections)
    for count, section in enumerate(doc.sections):
        print(count)
    doc.save(doc_name)


def convert_books(inputfile, src_lang, tar_lang, greater_than_10, header_footer_present):
    """Translate a PDF book into a .docx.

    :param inputfile: path of the source PDF (under .../media/scripts/book/).
    :param src_lang: source language code.
    :param tar_lang: target language code.
    :param greater_than_10: when truthy, split the PDF into 10-page chunks
        (the Extract API page limit) and produce one .docx per chunk; this
        branch currently returns None -- combining the chunks is still a TODO.
    :param header_footer_present: "header", "footer", "both" or anything else.
    :return: path of the produced .docx (single-shot branch only).
    """
    outputzipname_pre = str((str(inputfile).split("."))[0]) + ".zip"
    outputzipname = outputzipname_pre.replace("/book/", "/book/zips/")

    print("Extracting header and footer of every page if present")
    headerFlag = header_footer_present in ("header", "both")
    footerFlag = header_footer_present in ("footer", "both")
    headers, footers = extract_header_footer(inputfile, headerFlag, footerFlag)
    print("founded headers and footers", headers, footers)

    if greater_than_10:
        pdf = PyPDF2.PdfFileReader(inputfile, "rb")
        numpages = pdf.getNumPages()
        stem = (((str(inputfile).split('/'))[-1]).split('.'))[0]
        chunk_dir = f"{basepath}/media/scripts/book/intermediate_files/{stem}/"

        # Split [0, numpages) into 10-page (start, end) chunks.
        intermediate_pages = []
        hard_pages = numpages // 10
        for i in range(hard_pages):
            intermediate_pages.append((10 * i, 10 * (i + 1)))
        final_pages = numpages - 10 * hard_pages
        # BUG FIX: only append a trailing partial chunk when pages remain --
        # the original always appended, producing an empty (n, n) chunk and
        # a zero-page PDF whenever numpages was a multiple of 10.
        if final_pages:
            intermediate_pages.append((10 * hard_pages, 10 * hard_pages + final_pages))
        print("counter 1234")
        print(intermediate_pages)

        # Write each chunk out as its own PDF.
        for page_start, page_end in intermediate_pages:
            pdf_writer1 = PyPDF2.PdfFileWriter()
            for page in range(page_start, page_end):
                pdf_writer1.addPage(pdf.getPage(page))
            if not os.path.exists(chunk_dir):
                os.mkdir(chunk_dir, mode=0o777)
            with open(f"{chunk_dir}{page_start}_{page_end}.pdf", "wb") as file1:
                pdf_writer1.write(file1)

        # Making Docx for each 10-page chunk pdf.
        for page_start, page_end in intermediate_pages:
            pdf_file = f"{chunk_dir}{page_start}_{page_end}.pdf"
            print("Current File -> ", pdf_file)
            loc = pdf_text_images_extractor(api_creds, pdf_file,
                                            f"{chunk_dir}{page_start}_{page_end}.zip")
            print("Extracting all contents of zip")
            zip_extractor(loc)
            print("Parsing the Json File and getting all the details")
            json_dir = "/".join(loc.split("/")[:-1]) + "/"
            json_file_path = json_dir + "/contents/structuredData.json"
            all_pages_data = json_parser(json_file_path)
            print("Creating the Doc")
            print(basePath + f"/{(json_dir.split('/'))[-2]}.docx")
            # NOTE(review): headers/footers are indexed by whole-book page
            # number inside word_creator(), but all_pages_data here is
            # per-chunk -- the alignment looks off for chunks > 0; confirm.
            word_creator(all_pages_data,
                         basePath + f"/{(json_dir.split('/'))[-2]}.docx",
                         json_dir + f"/{(json_dir.split('/'))[-2]}.docx",
                         src_lang, tar_lang, header_footer_present, headers, footers)
            # deleting the directory of zip contents
            print("Deleting the Directory")
            print(json_dir)
            # shutil.rmtree(json_dir, ignore_errors=True)
        # TODO: combine all the per-chunk docx files into one and return it.
    else:
        print("Extracted All the texts and images from PDF-API CALL ")
        loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)
        print("Extracting all contents of zip")
        zip_extractor(loc)
        print("Parsing the Json File and getting all the details")
        json_dir = "/".join(loc.split("/")[:-1]) + "/"
        json_file_path = json_dir + "/contents/structuredData.json"
        all_pages_data = json_parser(json_file_path)
        print("Creating the Doc")
        print(basePath + f"/{(json_dir.split('/'))[-2]}.docx")
        word_creator(all_pages_data,
                     basePath + f"/{(json_dir.split('/'))[-2]}.docx",
                     json_dir + f"/{(json_dir.split('/'))[-2]}.docx",
                     src_lang, tar_lang, header_footer_present, headers, footers)
        # deleting the directory of zip contents
        print("Deleting the Directory")
        print(json_dir)
        shutil.rmtree(json_dir, ignore_errors=True)
        return str(basePath + f"/{(json_dir.split('/'))[-2]}.docx")