Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator/newConvertBook_name.py

# For extraction of text and images from PDF
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
# For zip extraction
from zipfile import ZipFile
# For parsing JSON
import json
# For adding tables in docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm, Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
def set_cell_margins(cell, **kwargs):
    """Set table-cell margins (values in dxa, i.e. twentieths of a point) on a python-docx cell."""
    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()
    tcMar = OxmlElement('w:tcMar')
    for m in ["top", "start", "bottom", "end"]:
        if m in kwargs:
            node = OxmlElement("w:{}".format(m))
            node.set(qn('w:w'), str(kwargs.get(m)))
            node.set(qn('w:type'), 'dxa')
            tcMar.append(node)
    tcPr.append(tcMar)
def add_table_to_doc(doc, df):
    """Append a pandas DataFrame to a python-docx Document as a bordered table."""
    columns = list(df.columns)
    table = doc.add_table(rows=1, cols=len(columns), style="Table Grid")
    table.autofit = True
    # Header row
    for col in range(len(columns)):
        set_cell_margins(table.cell(0, col), top=100, start=100, bottom=100, end=50)
        table.cell(0, col).text = columns[col].replace(" _x000D_", "").capitalize()
    # Data rows
    for row in df.itertuples():
        table_row = table.add_row().cells
        for col in range(len(columns)):
            set_cell_margins(table_row[col], top=100, start=100, bottom=100, end=50)
            table_row[col].text = str(row[col + 1]).replace(" _x000D_", "")
    return doc
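# Example usage (a minimal sketch; the xlsx path and output name below are hypothetical
# and follow the layout unpacked by zip_extractor()):
# demo_doc = Document()
# demo_df = pd.read_excel("contents/tables/fileoutpart0.xlsx")
# add_table_to_doc(demo_doc, demo_df)
# demo_doc.save("table_preview.docx")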
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe PDF Services Extract operation on inputFile and save the result zip to outputzip."""
    try:
        # Get base path.
        base_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        # Initial setup, create credentials instance.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()
        # Create an ExecutionContext using credentials and create a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()
        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        extract_pdf_operation.set_input(source)
        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)
        # Execute the operation and save the resulting zip to the requested location.
        result: FileRef = extract_pdf_operation.execute(execution_context)
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
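# Example usage (a sketch; the file names are hypothetical):
# pdf_text_images_extractor(api_creds, "/tmp/sample_book.pdf", "/tmp/sample_book.zip")
# The saved zip contains structuredData.json plus table/figure renditions, which
# zip_extractor() below unpacks for json_parser() and word_creator().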
def zip_extractor(filename):
    """Unpack the Extract API result zip into the "contents" directory."""
    with ZipFile(filename, 'r') as zipObj:
        # Extract all contents of the zip file into the "contents" directory
        zipObj.extractall("contents")
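# Assumed layout after extraction (based on the paths used elsewhere in this module):
#   contents/structuredData.json       - element-level JSON consumed by json_parser()
#   contents/tables/fileoutpart*.xlsx  - table renditions read by word_creator()
#   contents/figures/fileoutpart*.png  - figure renditions placed by word_creator()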
def json_parser(filename):
    """Parse structuredData.json into a per-page list of element descriptions."""
    # Opening JSON file; json.load returns the JSON object as a dictionary
    with open(filename, encoding="utf8") as f:
        data = json.load(f)
    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])
    all_pages_data = []
    curr_page_contents = []
    current_page = 0
    # Iterating through the JSON element list. Path looks like "//Document/P", so the
    # character at index 11 (just after "//Document/") identifies the element kind.
    for element in data['elements']:
        # Detection of headings and paragraphs
        if element['Path'][11] == "H" or element['Path'][11] == "P":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            current_element = ["Text", element["Text"], element["TextSize"], element["Font"]["family_name"],
                               element["Font"]["italic"], element["Font"]["weight"]]
            try:
                current_element.append(element["attributes"]["SpaceAfter"])
            except KeyError:
                current_element.append("")
            try:
                current_element.append(element["attributes"]["TextAlign"])
            except KeyError:
                current_element.append("")
            curr_page_contents.append(current_element)
        # Detection of a list between paragraphs
        elif element['Path'][11] == "L":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            differ_creator = element["Path"].split("/")
            if differ_creator[-1] == "Lbl":
                current_element = ["List Numbering", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            else:
                current_element = ["List Data", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            curr_page_contents.append(current_element)
        # Detection of figures
        elif element['Path'][11] == "F":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            current_element = ["Figure", element["filePaths"][0], element["attributes"]["Placement"],
                               element["attributes"]["BBox"][0], element["attributes"]["BBox"][1],
                               element["attributes"]["BBox"][2], element["attributes"]["BBox"][3]]
            curr_page_contents.append(current_element)
        # Detection of tables
        elif element['Path'][11] == "S":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page = element["Page"]
                curr_page_contents = []
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table", element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"], element["filePaths"][0]])
    all_pages_data.append(curr_page_contents)
    return all_pages_data
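# Shape of the per-element lists returned above (this is what word_creator() relies on):
#   ["Text", text, size, font_family, italic, weight, space_after, text_align]
#   ["List Numbering" | "List Data", text, size, font_family, italic, weight]
#   ["Figure", file_path, placement, bbox_x0, bbox_y0, bbox_x1, bbox_y1]
#   ["Table", num_rows, num_cols, file_path]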
def word_creator(all_data):
    """Rebuild a .docx from the per-page element lists produced by json_parser()."""
    listo = ""
    doc = Document()
    for page in all_data:
        for ele in page:
            # Writing text in docx
            if ele[0] == "Text":
                act = doc.add_paragraph()
                act_format = act.paragraph_format
                if ele[6] == "":
                    act_format.space_after = Pt(12)
                else:
                    act_format.space_after = Pt(int(ele[6]))
                if ele[7] == "Start":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.LEFT
                elif ele[7] == "Center":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
                elif ele[7] == "End":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                else:
                    # "Justify", missing or unknown alignment all fall back to justified text
                    act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                # if (non_dial_dest_lang == 'hi') or (non_dial_dest_lang == 'gu'):
                #     act.style.font.name = 'Mangal'
                # else:
                #     act.style.font.name = 'Courier New'
                para = act.add_run(ele[1])
                # Apply the extracted font and size to this run rather than mutating the
                # shared 'Normal' style, which would restyle every paragraph in the document.
                para.font.name = str(ele[3])
                para.font.size = Pt(int(ele[2]))
                if ele[4]:  # "italic" is a JSON boolean
                    para.italic = True
                if ele[5] > 400:
                    para.bold = True
            # Adding a table in docx
            elif ele[0] == "Table":
                # Read the table rendition (assumes zip_extractor() unpacked it into "contents")
                hr_df = pd.read_excel(os.path.join("contents", str(ele[3])))
                add_table_to_doc(doc, hr_df)
            # Remember whether the upcoming list is ordered or unordered
            elif ele[0] == "List Numbering":
                if list(ele[1])[0].isdigit():
                    listo = "Ordered"
                else:
                    listo = "UnOrdered"
            # Adding list items in docx
            elif ele[0] == "List Data":
                if listo == "Ordered":
                    para = doc.add_paragraph(ele[1], style='List Number')
                else:
                    para = doc.add_paragraph(ele[1], style='List Bullet')
                listo = ""
                for run in para.runs:
                    if ele[4]:  # "italic" is a JSON boolean
                        run.italic = True
                    if ele[5] > 300:
                        run.bold = True
            # Adding a figure in docx
            elif ele[0] == "Figure":
                # Figure rendition path relative to zip_extractor()'s "contents" directory;
                # width and height come from the element's bounding box (in points).
                doc.add_picture(os.path.join("contents", str(ele[1])),
                                width=Pt(int(ele[5]) - int(ele[3])),
                                height=Pt(int(ele[6]) - int(ele[4])))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        # Start a new page in the docx after each PDF page
        doc.add_page_break()
    # Save to file
    doc.save("hr_data11111111.docx")
def convert_books(inputfile):
    """Entry point: run the Extract API on a PDF and save the result zip next to the input file."""
    outputzipname = str(inputfile).split(".")[0] + ".zip"
    # ext = str(inputfile).split(".")
    # if ext[-1] == "pdf":
    #     pass
    #
    # def convert_to_pdf(input_docx, out_folder):
    #     p = subprocess.Popen(
    #         [
    #             "libreoffice",
    #             "--headless",
    #             "--convert-to",
    #             "pdf",
    #             "--outdir",
    #             out_folder,
    #             input_docx,
    #         ]
    #     )
    #     print(["--convert-to", "pdf", input_docx])
    #     p.communicate()
    print(outputzipname)
    pdf_text_images_extractor(api_creds, inputfile, outputzipname)
    # zip_extractor(outputzipname)
    return 1
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
# outputzipname = "someoutput2.zip"
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"
#convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
#pdf_text_images_extractor(api_creds,"/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf","output.zip")
#zip_extractor(outputzipname)
# all_pages_data = json_parser(json_file)
# #print(all_pages_data)
# word_creator(all_pages_data)
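# A minimal end-to-end sketch of the pipeline (the sample PDF path is hypothetical;
# the zip and "contents" locations follow the functions above):
if __name__ == "__main__":
    sample_pdf = "/tmp/sample_book.pdf"  # hypothetical input PDF
    convert_books(sample_pdf)  # calls the Extract API and saves /tmp/sample_book.zip
    zip_extractor("/tmp/sample_book.zip")  # unpack structuredData.json and renditions into ./contents
    pages = json_parser("contents/structuredData.json")
    word_creator(pages)  # writes the reconstructed .docx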