# Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator/newConvertBook.py
# (header reconstructed from web-viewer scrape residue: 565 lines, 22 KiB,
#  Python, last revision 2024-04-27 09:33:09 +00:00)
#for extraction of text and images from pdf
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
#for zip extraction
from zipfile import ZipFile
#for parsing json
import json
#for adding tables in docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION
#Delete Intermediate Files
import shutil
# For Translation
from conversion.translation.translation_function import translate_comparison
# For headers and footers
import PyPDF2
basePath = "/home/user/mnf/project/MNF/media/scripts/book/translated"
basepath = "/home/user/mnf/project/MNF"
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
def extract_header_footer(pdf_path, headerFlag, footerFlag):
    """Extract the first (header) and last (footer) text line of every page.

    Args:
        pdf_path: path to the source PDF file.
        headerFlag: when True, collect the first text line of each page.
        footerFlag: when True, collect the last text line of each page.

    Returns:
        (header_text, footer_text): two lists with one entry per page.
        Pages with no extractable text contribute "" so the lists stay
        aligned with page indices (word_creator indexes them by page count).
    """
    header_text = []
    footer_text = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfFileReader(file)
        for page in reader.pages:
            if not (headerFlag or footerFlag):
                continue
            # Extract once per page (the original called extractText() twice
            # when both flags were set). NOTE(review): extractText()/
            # PdfFileReader are the legacy PyPDF2 API — confirm installed
            # PyPDF2 version still provides them before upgrading.
            lines = page.extractText().splitlines()
            if headerFlag:
                # Guard against empty pages: the original indexed [0]
                # unconditionally and raised IndexError on blank pages.
                header_text.append(lines[0] if lines else "")
            if footerFlag:
                footer_text.append(lines[-1] if lines else "")
    return header_text, footer_text
def set_cell_margins(cell, **kwargs):
    """Set per-side margins on a python-docx table cell.

    Accepted keyword arguments: top, start, bottom, end — each an integer
    margin width in dxa (twentieths of a point). Sides not supplied are
    left untouched.
    """
    margins = OxmlElement('w:tcMar')
    for side in ("top", "start", "bottom", "end"):
        if side not in kwargs:
            continue
        entry = OxmlElement("w:{}".format(side))
        entry.set(qn('w:w'), str(kwargs.get(side)))
        entry.set(qn('w:type'), 'dxa')
        margins.append(entry)
    # Attach the assembled margin element to the cell's properties node.
    cell._tc.get_or_add_tcPr().append(margins)
def add_table_to_doc(doc, df):
    """Append a grid-styled table built from DataFrame *df* to *doc*.

    The first row holds the capitalized column names; one table row is added
    per DataFrame row. The Excel artifact " _x000D_" (carriage-return escape)
    is stripped from every cell. Returns *doc* for convenience.
    """
    headers = list(df.columns)
    n_cols = len(headers)
    table = doc.add_table(rows=1, cols=n_cols, style="Table Grid")
    table.autofit = True
    # Header row.
    for idx, name in enumerate(headers):
        cell = table.cell(0, idx)
        set_cell_margins(cell, top=100, start=100, bottom=100, end=50)
        cell.text = name.replace(" _x000D_", "").capitalize()
    # Data rows; itertuples() puts the index at position 0, hence idx + 1.
    for record in df.itertuples():
        cells = table.add_row().cells
        for idx in range(n_cols):
            set_cell_margins(cells[idx], top=100, start=100, bottom=100, end=50)
            cells[idx].text = str(record[idx + 1]).replace(" _x000D_", "")
    return doc
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe PDF Services Extract operation on *inputFile*.

    Extracts text and tables (plus table/figure renditions) and saves the
    result zip under a directory named after *outputzip*'s stem:
    <dir-of-outputzip>/<stem>/<stem>.zip

    Args:
        api_creds: path to the pdfservices-api-credentials.json file.
        inputFile: path of the PDF to process.
        outputzip: requested output zip path; only its directory and stem
            are used to build the final location.

    Returns:
        The path of the written zip file, or None when the Adobe service
        raises (the exception is logged, matching the original behavior).
    """
    try:
        # Build credentials and an execution context for the Extract API.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()
        extract_pdf_operation.set_input(FileRef.create_from_local_file(inputFile))
        # Request text + tables, with renditions for tables and figures.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)
        result: FileRef = extract_pdf_operation.execute(execution_context)
        print("Saving the Zip")
        # os.path replaces the original manual "/"-split reconstruction.
        # Keep split(".")[0] (not splitext) to preserve stems for names
        # containing multiple dots, exactly as before.
        out_dir = os.path.dirname(outputzip)
        stem = os.path.basename(outputzip).split(".")[0]
        target_dir = os.path.join(out_dir, stem)
        # exist_ok=True removes the exists()-then-makedirs race of the original.
        os.makedirs(target_dir, mode=0o777, exist_ok=True)
        target_zip = os.path.join(target_dir, stem + ".zip")
        # Context manager guarantees the handle closes even if the write raises.
        with open(target_zip, "wb") as outputfile:
            result.write_to_stream(outputfile)
        return target_zip
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
        return None
def zip_extractor(filename):
    """Extract every member of zip *filename* into a sibling 'contents' dir.

    e.g. /a/b/x.zip -> members extracted under /a/b/contents/.
    Uses os.path.dirname instead of the original manual "/"-split rebuild.
    """
    target_dir = os.path.join(os.path.dirname(filename), "contents")
    with ZipFile(filename, 'r') as zipObj:
        zipObj.extractall(target_dir)
def json_parser(filename):
    """Parse an Adobe Extract structuredData.json into per-page element lists.

    Each element's 'Path' looks like "//Document/<Tag>..."; the character at
    index 11 (the first character after "//Document/") selects the kind:
    H/P -> text, L -> list, F -> figure, S -> section (possibly a table).

    Args:
        filename: path to structuredData.json (UTF-8).

    Returns:
        A list with one entry per page; each entry is a list of elements:
        ["Text", text, size, font, italic, weight, space_after, align],
        ["List Numbering"|"List Data", text, size, font, italic, weight],
        ["Figure", file_path, placement, x0, y0, x1, y1], or
        ["Table", num_rows, num_cols, file_path].
    """
    # Context manager replaces the original open()/close() pair.
    with open(filename, encoding="utf8") as f:
        data = json.load(f)
    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])
    all_pages_data = []
    curr_page_contents = []
    current_page = 0
    for element in data['elements']:
        print("1789")
        print(element)
        path = element['Path']
        # Guard: the original list(path)[11] raised IndexError on short paths.
        tag = path[11] if len(path) > 11 else ""
        if tag not in ("H", "P", "L", "F", "S"):
            continue
        # Page rollover, hoisted out of the four branches that repeated it.
        # NOTE(review): increments by exactly one per rollover, so a page
        # with no elements would misalign subsequent pages — preserved from
        # the original; confirm the Extract API never skips page numbers.
        if current_page != element["Page"]:
            all_pages_data.append(curr_page_contents)
            current_page += 1
            curr_page_contents = []
        if tag in ("H", "P"):
            # Skip ill-detected paragraphs that lack the text/font fields.
            try:
                current_element = ["Text", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"],
                                   element["Font"]["weight"]]
            except KeyError:
                continue
            # Optional attributes default to "" (same as the original's
            # bare-except fallbacks, but without swallowing other errors).
            attrs = element.get("attributes", {})
            current_element.append(attrs.get("SpaceAfter", ""))
            current_element.append(attrs.get("TextAlign", ""))
            curr_page_contents.append(current_element)
        elif tag == "L":
            # A trailing "Lbl" path component marks the list marker itself.
            kind = "List Numbering" if path.split("/")[-1] == "Lbl" else "List Data"
            curr_page_contents.append([kind, element["Text"], element["TextSize"],
                                       element["Font"]["family_name"],
                                       element["Font"]["italic"],
                                       element["Font"]["weight"]])
        elif tag == "F":
            bbox = element["attributes"]["BBox"]
            curr_page_contents.append(["Figure", element["filePaths"][0],
                                       element["attributes"]["Placement"],
                                       bbox[0], bbox[1], bbox[2], bbox[3]])
        elif tag == "S":
            # BUG FIX: the original compared list(path)[11:21] (a list of
            # characters) against the string "Sect/Table" — never equal, so
            # tables were silently dropped. Compare the substring instead.
            if path[11:21] == "Sect/Table":
                curr_page_contents.append(["Table",
                                           element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])
    # Flush the final page.
    all_pages_data.append(curr_page_contents)
    return all_pages_data
def word_creator(all_data, doc_name, media_path_ref, src_lang, tar_lang,
                 header_footer_present, headers, footers):
    """Build a translated .docx from parsed per-page element data.

    Args:
        all_data: json_parser() output — one list of elements per page.
        doc_name: destination path for the saved .docx.
        media_path_ref: reference path whose directory holds the extracted
            zip contents (table .xlsx files, figure images).
        src_lang / tar_lang: translation source / target language codes.
        header_footer_present: "header", "footer", "both", or anything else
            for neither.
        headers / footers: per-page strings from extract_header_footer().

    Side effects: writes original+translated text pairs to
    translation_data.txt (hardcoded path) and saves the document.
    """
    # Directory that contains the extracted media, with trailing "/".
    media_path_dir = "/".join(media_path_ref.split("/")[:-1]) + "/"
    listo = ""
    doc = Document()
    # Context manager fixes the original's never-closed file handle.
    with open("/home/user/mnf/project/MNF/translation_data.txt", "w") as file:
        for count, page in enumerate(all_data):
            print("headers are:", headers)
            if header_footer_present in ("header", "both"):
                try:
                    header = doc.sections[count].header
                    header_text = header.paragraphs[0].add_run(str(headers[count]))
                    header_text.font.size = Pt(12)
                    header_text.font.bold = True
                except Exception as e:
                    print("Adding Header has the below error: ", e)
            for ele in page:
                print("Current Element", ele)
                if ele[0] == "Text":
                    # NOTE(review): mutating the shared 'Normal' style per
                    # element is preserved from the original — it affects
                    # every paragraph using that style, not just this one.
                    style = doc.styles['Normal']
                    font = style.font
                    font.name = str(ele[3])
                    font.size = Pt(int(ele[2]))
                    act = doc.add_paragraph(style=style)
                    act_format = act.paragraph_format
                    act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                    # Map Adobe's TextAlign values; anything unknown (or "")
                    # falls back to justified, as before.
                    alignment_map = {
                        "Justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
                        "Start": WD_ALIGN_PARAGRAPH.LEFT,
                        "Center": WD_ALIGN_PARAGRAPH.CENTER,
                        "End": WD_ALIGN_PARAGRAPH.RIGHT,
                    }
                    act_format.alignment = alignment_map.get(
                        ele[7], WD_ALIGN_PARAGRAPH.JUSTIFY)
                    act_format.line_spacing = Pt(12)
                    act_format.left_indent = Inches(0)
                    trans_text, trans = translate_comparison(ele[1], src_lang,
                                                             tar_lang, True)
                    file.write(str(ele[1]))
                    file.write(str(trans))
                    para = act.add_run(trans_text)
                    if ele[4] == "true":
                        para.italic = True
                    if ele[5] > 400:  # font weight above regular -> bold
                        para.bold = True
                elif ele[0] == "Table":
                    hr_df = pd.read_excel(media_path_dir + str(ele[3]))
                    # BUG FIX: the original re-created Document() here,
                    # silently discarding everything already written. Adjust
                    # margins on the document being built instead.
                    section = doc.sections[-1]
                    section.left_margin = Mm(5)
                    section.right_margin = Mm(5)
                    add_table_to_doc(doc, hr_df.iloc[:5])
                elif ele[0] == "List Numbering":
                    # Remember whether the upcoming list item is ordered.
                    listo = "Ordered" if ele[1][0].isdigit() else "UnOrdered"
                elif ele[0] == "List Data":
                    list_style = 'List Number' if listo == "Ordered" else 'List Bullet'
                    para = doc.add_paragraph(
                        translate_comparison(ele[1], src_lang, tar_lang, True),
                        style=list_style)
                    listo = ""
                    if ele[4] == "true":
                        para.italic = True
                    if ele[5] > 300:
                        para.bold = True
                elif ele[0] == "Figure":
                    # BBox order is (x0, y0, x1, y1); width/height in points.
                    doc.add_picture(media_path_dir + "/contents/" + str(ele[1]),
                                    width=Pt(int(ele[5]) - int(ele[3])),
                                    height=Pt(int(ele[6]) - int(ele[4])))
                    doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
            print("footers are:", footers)
            if header_footer_present in ("footer", "both"):
                try:
                    footer = doc.sections[count].footer
                    # BUG FIX: the original wrote str(footer[count]) —
                    # indexing the footer object — so every footer failed
                    # into the except branch. Use the footers list.
                    footer_text = footer.paragraphs[0].add_run(str(footers[count]))
                    footer_text.font.size = Pt(12)
                    footer_text.font.bold = True
                except Exception as e:
                    print("Adding Footer has the below error: ", e)
            # New page + new section per source page so each page can carry
            # its own header/footer.
            doc.add_page_break()
            doc.add_section(WD_SECTION.NEW_PAGE)
    print("Saving the Doc")
    print(doc_name)
    print(doc.sections)
    for count, section in enumerate(doc.sections):
        print(count)
    doc.save(doc_name)
def convert_books(inputfile, src_lang, tar_lang, greater_than_10, header_footer_present):
    """Translate a PDF book into a .docx via the Adobe Extract pipeline.

    For PDFs flagged greater_than_10, the file is split into 10-page chunks
    (API page limit workaround), each chunk is extracted and rendered to its
    own .docx; combining them is still TODO and that branch returns None.
    Otherwise the whole PDF goes through one extract/parse/word_creator pass
    and the resulting .docx path is returned.

    Args:
        inputfile: path to the source PDF under media/scripts/book/.
        src_lang / tar_lang: translation language codes.
        greater_than_10: True when the PDF exceeds the 10-page API limit.
        header_footer_present: "header" | "footer" | "both" | other.
    """
    outputzipname = (str(inputfile).split(".")[0] + ".zip").replace("/book/", "/book/zips/")
    print("Extracting header and footer of every page if present")
    headerFlag = header_footer_present in ("header", "both")
    footerFlag = header_footer_present in ("footer", "both")
    headers, footers = extract_header_footer(inputfile, headerFlag, footerFlag)
    print("founded headers and footers", headers, footers)
    # Filename stem, hoisted out of the original's repeated split chains.
    stem = str(inputfile).split('/')[-1].split('.')[0]
    if greater_than_10:
        pdf = PyPDF2.PdfFileReader(inputfile, "rb")
        numpages = pdf.getNumPages()
        intermediate_pages = []
        hard_pages = numpages // 10
        for i in range(hard_pages):
            intermediate_pages.append((10 * i, 10 * (i + 1)))
        final_pages = numpages - 10 * hard_pages
        # BUG FIX: only add the trailing chunk when pages remain — the
        # original appended an empty (n, n) range when numpages was a
        # multiple of 10, producing an empty PDF and a wasted API call.
        if final_pages > 0:
            intermediate_pages.append((10 * hard_pages, 10 * hard_pages + final_pages))
        print("counter 1234")
        print(intermediate_pages)
        inter_dir = f"{basepath}/media/scripts/book/intermediate_files/{stem}/"
        # Write each 10-page slice to its own intermediate PDF.
        for page_start, page_end in intermediate_pages:
            pdf_writer1 = PyPDF2.PdfFileWriter()
            for page in range(page_start, page_end):
                pdf_writer1.addPage(pdf.getPage(page))
            # exist_ok=True removes the exists()-then-mkdir race.
            os.makedirs(inter_dir, mode=0o777, exist_ok=True)
            with open(f"{inter_dir}{page_start}_{page_end}.pdf", "wb") as file1:
                pdf_writer1.write(file1)
        # Extract + translate each chunk into its own .docx.
        for page_start, page_end in intermediate_pages:
            pdf_file = f"{inter_dir}{page_start}_{page_end}.pdf"
            print("Current File -> ", pdf_file)
            loc = pdf_text_images_extractor(api_creds, pdf_file,
                                            f"{inter_dir}{page_start}_{page_end}.zip")
            print("Extracting all contents of zip")
            zip_extractor(loc)
            print("Parsing the Json File and getting all the details")
            json_dir = "/".join(loc.split("/")[:-1]) + "/"
            json_file_path = json_dir + "/contents/structuredData.json"
            all_pages_data = json_parser(json_file_path)
            print("Creating the Doc")
            print(basePath + f"/{json_dir.split('/')[-2]}.docx")
            word_creator(all_pages_data,
                         basePath + f"/{json_dir.split('/')[-2]}.docx",
                         json_dir + f"/{json_dir.split('/')[-2]}.docx",
                         src_lang, tar_lang, header_footer_present,
                         headers, footers)
            print("Deleting the Directory")
            print(json_dir)
            # Intermediate cleanup intentionally disabled (as in original):
            # shutil.rmtree(json_dir, ignore_errors=True)
        # TODO: combine the per-chunk .docx files into one document; until
        # then this branch (like the original) returns None.
        pass
    else:
        print("Extracted All the texts and images from PDF-API CALL ")
        loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)
        print("Extracting all contents of zip")
        zip_extractor(loc)
        print("Parsing the Json File and getting all the details")
        json_dir = "/".join(loc.split("/")[:-1]) + "/"
        json_file_path = json_dir + "/contents/structuredData.json"
        all_pages_data = json_parser(json_file_path)
        print("Creating the Doc")
        print(basePath + f"/{json_dir.split('/')[-2]}.docx")
        word_creator(all_pages_data,
                     basePath + f"/{json_dir.split('/')[-2]}.docx",
                     json_dir + f"/{json_dir.split('/')[-2]}.docx",
                     src_lang, tar_lang, header_footer_present,
                     headers, footers)
        print("Deleting the Directory")
        print(json_dir)
        shutil.rmtree(json_dir, ignore_errors=True)
        return str(basePath + f"/{json_dir.split('/')[-2]}.docx")
#
# def convert_books_grt10pages(inputfile, src_lang, tar_lang):
# outputzipname_pre = str((str(inputfile).split("."))[0]) + ".zip"
# outputzipname = outputzipname_pre.replace("/book/", "/book/zips/")
#
# print("Extracted All the texts and images from PDF-API CALL ")
# loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)
#
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
# outputzipname = "someoutput2.zip"
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"
#convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
#pdf_text_images_extractor(api_creds,"/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf","output.zip")
#zip_extractor(outputzipname)
# all_pages_data = json_parser(json_file)
# #print(all_pages_data)
# word_creator(all_pages_data)