#!/usr/bin/env python3
# Book translator: extract text, tables and figures from a PDF via the
# Adobe PDF Services Extract API and rebuild the content as a .docx file.
#for extraction of text and images from pdf
|
|
import logging
|
|
import os.path
|
|
from adobe.pdfservices.operation.auth.credentials import Credentials
|
|
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
|
|
ExtractRenditionsElementType
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
|
|
from adobe.pdfservices.operation.execution_context import ExecutionContext
|
|
from adobe.pdfservices.operation.io.file_ref import FileRef
|
|
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
|
|
|
|
#for zip extraction
|
|
from zipfile import ZipFile
|
|
|
|
#for parsing json
|
|
import json
|
|
|
|
|
|
#for adding tables in docx
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx import Document
|
|
import pandas as pd
|
|
from docx.shared import Mm
|
|
from docx.shared import Inches, Cm, Pt
|
|
from docx.oxml.shared import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
|
|
|
|
# Service-account credentials for the Adobe PDF Services Extract API.
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"

# Log level comes from the LOGLEVEL environment variable; INFO by default.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
|
|
|
|
|
|
|
|
def set_cell_margins(cell, **kwargs):
    """Set the internal margins of a docx table cell.

    Keyword arguments ``top``, ``start``, ``bottom`` and ``end`` give the
    margin width in dxa (twentieths of a point); sides not supplied are
    left untouched.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    margins = OxmlElement('w:tcMar')

    for side in ("top", "start", "bottom", "end"):
        if side not in kwargs:
            continue
        node = OxmlElement("w:{}".format(side))
        node.set(qn('w:w'), str(kwargs.get(side)))
        node.set(qn('w:type'), 'dxa')
        margins.append(node)

    cell_props.append(margins)
|
|
|
|
|
|
|
|
def add_table_to_doc(doc, df):
    """Append dataframe *df* to *doc* as a grid-style docx table.

    The first table row holds the capitalised column names.  Every cell is
    stripped of the " _x000D_" carriage-return artefact left by the Excel
    export and given uniform internal margins.  Returns *doc*.
    """
    headers = list(df.columns)

    table = doc.add_table(rows=1, cols=len(headers), style="Table Grid")
    table.autofit = True

    # Header row.
    for idx in range(len(headers)):
        header_cell = table.cell(0, idx)
        set_cell_margins(header_cell, top=100, start=100, bottom=100, end=50)
        header_cell.text = headers[idx].replace(" _x000D_", "").capitalize()

    # One docx row per dataframe row; itertuples() puts the index at [0],
    # so column values start at row[1].
    for i, row in enumerate(df.itertuples()):
        cells = table.add_row().cells
        for idx in range(len(headers)):
            set_cell_margins(cells[idx], top=100, start=100, bottom=100, end=50)
            cells[idx].text = str(row[idx + 1]).replace(" _x000D_", "")

    return doc
|
|
|
|
|
|
|
|
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe PDF Services Extract operation on *inputFile*.

    Extracts text and tables (plus renditions of tables and figures) and
    saves the resulting archive to *outputzip*.  Service/SDK errors are
    logged rather than raised, so a failed extraction does not abort the
    caller.
    """
    try:
        # Initial setup: build a credentials instance from the JSON file.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()

        # Create an ExecutionContext using credentials and a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)

        # BUG FIX: the result used to be saved to the hard-coded "/tmp/extra",
        # ignoring the outputzip argument every caller passes in.
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
|
|
|
|
|
|
|
|
def zip_extractor(filename, dest="contents"):
    """Extract every member of the zip archive *filename* into *dest*.

    *dest* defaults to the "contents" directory the rest of the pipeline
    reads from (backward compatible); the directory is created if missing.
    """
    with ZipFile(filename, 'r') as zipObj:
        # Extract all the contents of the zip file into the target directory.
        zipObj.extractall(dest)
|
|
|
|
|
|
|
|
def json_parser(filename):
    """Parse an Adobe Extract structuredData JSON file into per-page lists.

    Returns a list with one entry per page; each entry is a list of
    elements.  An element is a list whose first item is a kind tag
    ("Text", "List Numbering", "List Data", "Figure" or "Table") followed
    by kind-specific fields consumed by word_creator().
    """
    with open(filename, encoding="utf8") as f:
        data = json.load(f)

    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])

    all_pages_data = []
    curr_page_contents = []
    current_page = 0

    for element in data['elements']:
        # Character 11 of the Path ("//Document/X...") is the tag initial:
        # H(eading), P(aragraph), L(ist), F(igure), S(ect — contains tables).
        # Slicing (instead of list indexing) is safe for short paths.
        tag = element['Path'][11:12]
        if tag not in ("H", "P", "L", "F", "S"):
            continue

        # Flush the page buffer whenever the page number moves on.
        if current_page != element["Page"]:
            all_pages_data.append(curr_page_contents)
            curr_page_contents = []
            # BUG FIX: was `current_page += 1`, which drifted out of sync
            # whenever content skipped more than one page.
            current_page = element["Page"]

        # Headings and paragraphs.
        if tag in ("H", "P"):
            current_element = ["Text", element["Text"], element["TextSize"],
                              element["Font"]["family_name"],
                              element["Font"]["italic"], element["Font"]["weight"]]
            # Optional layout attributes; "" marks "not present in the PDF".
            for attr in ("SpaceAfter", "TextAlign"):
                try:
                    current_element.append(element["attributes"][attr])
                except KeyError:
                    current_element.append("")
            curr_page_contents.append(current_element)

        # Lists: ".../Lbl" entries carry the bullet/number marker, the
        # remaining entries carry the item text.
        elif tag == "L":
            if element["Path"].split("/")[-1] == "Lbl":
                kind = "List Numbering"
            else:
                kind = "List Data"
            curr_page_contents.append([kind, element["Text"], element["TextSize"],
                                       element["Font"]["family_name"],
                                       element["Font"]["italic"],
                                       element["Font"]["weight"]])

        # Figures: file path, placement and bounding box (x0, y0, x1, y1).
        elif tag == "F":
            bbox = element["attributes"]["BBox"]
            curr_page_contents.append(["Figure", element["filePaths"][0],
                                       element["attributes"]["Placement"],
                                       bbox[0], bbox[1], bbox[2], bbox[3]])

        # Tables (nested under a Sect node).
        elif tag == "S":
            # BUG FIX: the original compared a *list* of characters
            # (list(path)[11:21]) with the string "Sect/Table" — never
            # equal, so every table was silently dropped.
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table",
                                           element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])

    # Flush the final page.
    all_pages_data.append(curr_page_contents)
    return all_pages_data
|
|
|
|
|
|
|
|
def word_creator(all_data, output_path="hr_data11111111.docx"):
    """Rebuild a .docx document from the per-page element lists of json_parser.

    *all_data* is a list of pages, each a list of elements tagged "Text",
    "Table", "List Numbering", "List Data" or "Figure".  The result is
    saved to *output_path* (default kept for backward compatibility).
    """
    listo = ""
    doc = Document()

    for page in all_data:
        for ele in page:

            # -- plain text / headings ------------------------------------
            if ele[0] == "Text":
                style = doc.styles['Normal']
                font = style.font
                font.name = str(ele[3])
                font.size = Pt(int(ele[2]))
                act = doc.add_paragraph(style=style)
                act_format = act.paragraph_format
                # ele[6] is SpaceAfter; "" means the PDF carried none.
                act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                # ele[7] is TextAlign; unknown/absent values justify.
                alignments = {
                    "Justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
                    "Start": WD_ALIGN_PARAGRAPH.LEFT,
                    "Center": WD_ALIGN_PARAGRAPH.CENTER,
                    "End": WD_ALIGN_PARAGRAPH.RIGHT,
                }
                act_format.alignment = alignments.get(ele[7], WD_ALIGN_PARAGRAPH.JUSTIFY)
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                para = act.add_run(ele[1])
                # BUG FIX: json.load yields booleans for Font.italic, so the
                # old comparison with the string "true" never matched.
                if ele[4] is True or ele[4] == "true":
                    para.italic = True
                if ele[5] > 400:
                    para.bold = True

            # -- tables ---------------------------------------------------
            elif ele[0] == "Table":
                # TODO(review): should this read the export recorded in
                # ele[3] instead of a fixed sample file?  Confirm the layout
                # of the extracted "contents" directory first.
                hr_df = pd.read_excel('C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents_table\\tables\\fileoutpart0.xlsx')
                # BUG FIX: the original re-created `doc = Document()` here,
                # silently discarding everything already written; reuse the
                # document being built instead.
                section = doc.sections[0]
                section.left_margin = Mm(5)
                section.right_margin = Mm(5)
                add_table_to_doc(doc, hr_df.iloc[:5])

            # -- list numbering marker ------------------------------------
            elif ele[0] == "List Numbering":
                # A leading digit means a numbered list, otherwise bullets.
                if (list(ele[1])[0]).isdigit():
                    listo = "Ordered"
                else:
                    listo = "UnOrdered"

            # -- list item text -------------------------------------------
            elif ele[0] == "List Data":
                style_name = 'List Number' if listo == "Ordered" else 'List Bullet'
                para = doc.add_paragraph(ele[1], style=style_name)
                listo = ""
                # BUG FIX: italic/bold live on runs; assigning them on the
                # Paragraph object had no effect in python-docx.
                for run in para.runs:
                    if ele[4] is True or ele[4] == "true":
                        run.italic = True
                    if ele[5] > 300:
                        run.bold = True

            # -- figures --------------------------------------------------
            elif ele[0] == "Figure":
                print(ele)
                # ele[3..6] is the bounding box (x0, y0, x1, y1).
                doc.add_picture("C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\" + str(ele[1]),
                                width=Pt(int(ele[5]) - int(ele[3])),
                                height=Pt(int(ele[6]) - int(ele[4])))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # One page of output per page of input.
        doc.add_page_break()

    # Save to file.
    doc.save(output_path)
|
|
|
|
|
|
|
|
|
|
def convert_books(inputfile):
    """Kick off PDF extraction for *inputfile*.

    The extraction archive is written next to the input as "<stem>.zip".
    Always returns 1, kept for backward compatibility with callers that
    treat any return as success.
    """
    # BUG FIX: the original used str(inputfile).split(".")[0], which
    # truncated at the FIRST dot — any dot in a directory name mangled the
    # output path.  splitext only strips the final extension.
    outputzipname = os.path.splitext(str(inputfile))[0] + ".zip"
    logging.info("writing extraction archive to %s", outputzipname)

    pdf_text_images_extractor(api_creds, inputfile, outputzipname)

    # zip_extractor(outputzipname)  # downstream unpack, currently disabled
    return 1
|
|
|
|
|
|
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
|
|
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
|
|
# outputzipname = "someoutput2.zip"
|
|
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"
|
|
|
|
|
|
#convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
|
|
#pdf_text_images_extractor(api_creds,"/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf","output.zip")
|
|
|
|
#zip_extractor(outputzipname)
|
|
|
|
# all_pages_data = json_parser(json_file)
|
|
# #print(all_pages_data)
|
|
# word_creator(all_pages_data)
|
|
|