# Extraction of text and images from PDF using the Adobe PDF Services Extract API.
import logging
import os.path

from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation

# For zip extraction.
from zipfile import ZipFile

# For parsing the structured JSON output.
import json

# For building tables and text in the .docx output.
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn

api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"

logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


def set_cell_margins(cell, **kwargs):
    """Set cell margins (in dxa, twentieths of a point) on a python-docx table cell."""
    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()
    tcMar = OxmlElement('w:tcMar')
    for m in ["top", "start", "bottom", "end"]:
        if m in kwargs:
            node = OxmlElement("w:{}".format(m))
            node.set(qn('w:w'), str(kwargs.get(m)))
            node.set(qn('w:type'), 'dxa')
            tcMar.append(node)
    tcPr.append(tcMar)


def add_table_to_doc(doc, df):
    """Render a pandas DataFrame as a grid table in the given docx Document."""
    columns = list(df.columns)
    table = doc.add_table(rows=1, cols=len(columns), style="Table Grid")
    table.autofit = True

    # Header row.
    for col in range(len(columns)):
        set_cell_margins(table.cell(0, col), top=100, start=100, bottom=100, end=50)
        table.cell(0, col).text = columns[col].replace(" _x000D_", "").capitalize()

    # Data rows (itertuples yields the index at position 0, hence col + 1).
    for row in df.itertuples():
        table_row = table.add_row().cells
        for col in range(len(columns)):
            set_cell_margins(table_row[col], top=100, start=100, bottom=100, end=50)
            table_row[col].text = str(row[col + 1]).replace(" _x000D_", "")
    return doc


def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe Extract PDF operation and save the resulting zip to outputzip."""
    try:
        # Initial setup: create a credentials instance.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()

        # Create an ExecutionContext using the credentials and a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them on the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)
        # Save the result (a zip of structuredData.json plus renditions) to the requested location.
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")


def zip_extractor(filename):
    """Extract all contents of the result zip into the ./contents directory."""
    with ZipFile(filename, 'r') as zipObj:
        zipObj.extractall("contents")


def json_parser(filename):
    """Walk structuredData.json and group text / list / figure / table elements per page."""
    # Open the JSON file and load it as a dictionary.
    f = open(filename, encoding="utf8")
    data = json.load(f)

    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])

    all_pages_data = []
    curr_page_contents = []
    current_page = 0

    # Iterate through the element list. Path looks like "//Document/P", "//Document/H1",
    # "//Document/L/LI/Lbl", "//Document/Figure" or "//Document/Sect/Table", so the character
    # at index 11 (just after "//Document/") identifies the element type.
    for element in data['elements']:
        tag = element['Path'][11]

        # Headings and paragraphs.
        if tag == "H" or tag == "P":
            if current_page != element["Page"]:
                # New page: flush the buffer and resync the page counter.
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            current_element = ["Text", element["Text"], element["TextSize"],
                               element["Font"]["family_name"],
                               element["Font"]["italic"], element["Font"]["weight"]]
            try:
                current_element.append(element["attributes"]["SpaceAfter"])
            except KeyError:
                current_element.append("")
            try:
                current_element.append(element["attributes"]["TextAlign"])
            except KeyError:
                current_element.append("")
            curr_page_contents.append(current_element)

        # Lists between paragraphs.
        elif tag == "L":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            differ_creator = element["Path"].split("/")
            if differ_creator[-1] == "Lbl":
                current_element = ["List Numbering", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            else:
                current_element = ["List Data", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            curr_page_contents.append(current_element)

        # Figures.
        elif tag == "F":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            current_element = ["Figure", element["filePaths"][0], element["attributes"]["Placement"],
                               element["attributes"]["BBox"][0], element["attributes"]["BBox"][1],
                               element["attributes"]["BBox"][2], element["attributes"]["BBox"][3]]
            curr_page_contents.append(current_element)

        # Tables (paths continuing with "Sect/Table" after "//Document/").
        elif tag == "S":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table", element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"], element["filePaths"][0]])

    # Flush the final page.
    all_pages_data.append(curr_page_contents)

    f.close()
    return all_pages_data


def word_creator(all_data):
    """Rebuild a .docx from the per-page element lists produced by json_parser."""
    listo = ""
    doc = Document()
    for page in all_data:
        for ele in page:
            # Headings and paragraphs.
            if ele[0] == "Text":
                act = doc.add_paragraph()
                act_format = act.paragraph_format
                # Spacing after the paragraph (SpaceAfter attribute, default 12pt).
                if ele[6] == "":
                    act_format.space_after = Pt(12)
                else:
                    act_format.space_after = Pt(int(ele[6]))
                # Alignment (TextAlign attribute, default justified).
                if ele[7] == "" or ele[7] == "Justify":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                elif ele[7] == "Start":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.LEFT
                elif ele[7] == "Center":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
                elif ele[7] == "End":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                else:
                    act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                # TODO: switch the run font to 'Mangal' for Hindi/Gujarati targets,
                # 'Courier New' otherwise, once the target language is known here.
                run = act.add_run(ele[1])
                # Apply the source font to this run instead of mutating the shared
                # 'Normal' style, so each paragraph keeps its own face and size.
                run.font.name = str(ele[3])
                run.font.size = Pt(int(ele[2]))
                if ele[4]:            # Font.italic is a boolean in the parsed JSON.
                    run.italic = True
                if ele[5] > 400:      # Treat font weights above 400 as bold.
                    run.bold = True

            # Tables: rebuild from the .xlsx rendition extracted alongside the JSON.
            elif ele[0] == "Table":
                # ele[3] is the rendition path from structuredData.json, relative to the
                # "contents" directory that zip_extractor unpacks into.
                hr_df = pd.read_excel(os.path.join("contents", ele[3]))
                section = doc.sections[0]
                section.left_margin = Mm(5)
                section.right_margin = Mm(5)
                add_table_to_doc(doc, hr_df)

            # List label: decide ordered vs unordered from its first character.
            elif ele[0] == "List Numbering":
                if ele[1][0].isdigit():
                    listo = "Ordered"
                else:
                    listo = "UnOrdered"

            # List item text: use the numbered or bulleted style chosen above.
            elif ele[0] == "List Data":
                if listo == "Ordered":
                    para = doc.add_paragraph(ele[1], style='List Number')
                else:
                    para = doc.add_paragraph(ele[1], style='List Bullet')
                listo = ""
                if ele[4]:
                    para.runs[0].italic = True
                if ele[5] > 300:
                    para.runs[0].bold = True

            # Figures: insert the extracted rendition, sized from the element BBox (points).
            elif ele[0] == "Figure":
                doc.add_picture(os.path.join("contents", str(ele[1])),
                                width=Pt(int(ele[5]) - int(ele[3])),
                                height=Pt(int(ele[6]) - int(ele[4])))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # One page of the PDF per page of the document.
        doc.add_page_break()

    # Save to file.
    doc.save("hr_data11111111.docx")


def convert_books(inputfile):
    """Entry point: run the extract operation for a PDF and save the result zip beside it."""
    # TODO: if the input is not already a PDF, convert it first
    # (e.g. headless LibreOffice: libreoffice --headless --convert-to pdf <file>).
    outputzipname = str(inputfile).split(".")[0] + ".zip"
    pdf_text_images_extractor(api_creds, inputfile, outputzipname)
    # zip_extractor(outputzipname)
    return 1
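

# A minimal end-to-end driver sketch, assuming the credentials file at api_creds is valid,
# that the result zip is written next to the input PDF (the naming rule convert_books uses),
# and that zip_extractor unpacks it into ./contents so the structured output ends up at
# contents/structuredData.json. The sample PDF path is illustrative, not required.
if __name__ == "__main__":
    input_pdf = "/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf"
    convert_books(input_pdf)                              # run the Adobe Extract operation
    output_zip = str(input_pdf).split(".")[0] + ".zip"    # same naming rule as convert_books
    zip_extractor(output_zip)                             # unpack renditions + structuredData.json
    all_pages_data = json_parser("contents/structuredData.json")
    word_creator(all_pages_data)                          # writes the reconstructed .docx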