# For extraction of text and images from PDF
import logging
import os

from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
    ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation

# For zip extraction
from zipfile import ZipFile

# For parsing JSON
import json

# For adding tables in docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION

# For deleting intermediate files
import shutil

# For translation
from conversion.translation.translation_function import translate_comparison

# For headers and footers (uses the legacy PdfFileReader/PdfFileWriter API)
import PyPDF2

basePath = "/home/user/mnf/project/MNF/media/scripts/book/translated"
basepath = "/home/user/mnf/project/MNF"

api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

def extract_header_footer(pdf_path, headerFlag, footerFlag):
    header_text = []
    footer_text = []

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfFileReader(file)

        for page in reader.pages:
            # Take the first/last text line of each page as its header/footer.
            lines = page.extractText().splitlines()
            if headerFlag and lines:
                # Append the extracted header to the overall header text
                header_text.append(lines[0])
            if footerFlag and lines:
                # Append the extracted footer to the overall footer text
                footer_text.append(lines[-1])

    return header_text, footer_text

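# Usage sketch (illustrative; "sample.pdf" is a hypothetical local file).
# Header/footer detection is purely positional, so pages without a real
# header/footer contribute whatever text happens to sit first/last:
#
#   headers, footers = extract_header_footer("sample.pdf", True, True)
#   for h, f in zip(headers, footers):
#       print("header:", h, "| footer:", f)
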
def set_cell_margins(cell, **kwargs):
    # Write a w:tcMar element into the cell's properties; margin values are
    # given in dxa (twentieths of a point).
    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()
    tcMar = OxmlElement('w:tcMar')

    for m in ["top", "start", "bottom", "end"]:
        if m in kwargs:
            node = OxmlElement("w:{}".format(m))
            node.set(qn('w:w'), str(kwargs.get(m)))
            node.set(qn('w:type'), 'dxa')
            tcMar.append(node)

    tcPr.append(tcMar)

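# Since dxa is a twentieth of a point, the 100/50 values used below mean
# 5 pt / 2.5 pt padding. A minimal sketch against a throwaway document:
#
#   d = Document()
#   t = d.add_table(rows=1, cols=1, style="Table Grid")
#   set_cell_margins(t.cell(0, 0), top=100, start=100, bottom=100, end=50)
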
def add_table_to_doc(doc, df):
    columns = list(df.columns)

    table = doc.add_table(rows=1, cols=len(columns), style="Table Grid")
    table.autofit = True

    # Header row; "_x000D_" is a carriage-return artifact from the extracted xlsx.
    for col in range(len(columns)):
        set_cell_margins(table.cell(0, col), top=100, start=100, bottom=100, end=50)
        table.cell(0, col).text = columns[col].replace(" _x000D_", "").capitalize()

    # One table row per DataFrame row (itertuples puts the index at position 0).
    for i, row in enumerate(df.itertuples()):
        table_row = table.add_row().cells
        for col in range(len(columns)):
            set_cell_margins(table_row[col], top=100, start=100, bottom=100, end=50)
            table_row[col].text = str(row[col + 1]).replace(" _x000D_", "")

    return doc

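# Minimal sketch of add_table_to_doc with an in-memory frame (illustrative;
# the file name is a placeholder):
#
#   df = pd.DataFrame({"name": ["a", "b"], "qty": [1, 2]})
#   d = Document()
#   add_table_to_doc(d, df)     # header row plus one row per DataFrame row
#   d.save("table_demo.docx")
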
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    try:
        # Initial setup, create credentials instance.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()

        # Create an ExecutionContext using credentials and create a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        # source = FileRef.createFromURL(inputUrl)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)

        # Save the result to <outputzip dir>/<name>/<name>.zip.
        print("Saving the Zip")
        outputzip_path1 = "/".join(outputzip.split("/")[:-1]) + "/"
        filename = outputzip.split("/")[-1].split(".")[0]
        if not os.path.exists(outputzip_path1 + filename):
            os.makedirs(outputzip_path1 + filename, mode=0o777, exist_ok=False)
        zip_path = outputzip_path1 + filename + "/" + filename + ".zip"
        with open(zip_path, "wb") as outputfile:
            result.write_to_stream(outputfile)

        return zip_path
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")

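# Sketch of a call and the resulting layout (assumes valid Adobe credentials;
# "/tmp/in.pdf" and "/tmp/out.zip" are hypothetical paths):
#
#   loc = pdf_text_images_extractor(api_creds, "/tmp/in.pdf", "/tmp/out.zip")
#   # loc -> "/tmp/out/out.zip"; the archive holds structuredData.json plus
#   # the table xlsx and figure renditions. On API errors the exception is
#   # logged and the function returns None.
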
def zip_extractor(filename):
    # Extract the archive next to itself, into a "contents" subdirectory.
    filename_final = "/".join(filename.split("/")[:-1]) + "/"

    with ZipFile(filename, 'r') as zipObj:
        zipObj.extractall(filename_final + "contents")

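# Continuing the sketch above: zip_extractor("/tmp/out/out.zip") unpacks the
# archive into "/tmp/out/contents/" (hypothetical paths).
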
def json_parser(filename):
    # Opening JSON file
    f = open(filename, encoding="utf8")

    # Returns JSON object as a dictionary
    data = json.load(f)

    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])

    all_pages_data = []
    curr_page_contents = []
    current_page = 0

    # Element paths look like "//Document/P", "//Document/H1", "//Document/L",
    # "//Document/Figure", "//Document/Sect/Table", ... so the character at
    # index 11 (right after "//Document/") identifies the element kind.
    for element in data['elements']:
        # print(element)  # debug
        kind = element['Path'][11:12]

        # Detection of headings and paragraphs
        if kind == "H" or kind == "P":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []

            # Skip ill-detected paragraphs that lack text or font information.
            try:
                current_element = ["Text", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
                try:
                    current_element.append(element["attributes"]["SpaceAfter"])
                except KeyError:
                    current_element.append("")
                try:
                    current_element.append(element["attributes"]["TextAlign"])
                except KeyError:
                    current_element.append("")

                curr_page_contents.append(current_element)
            except KeyError:
                continue

        # Detection of a list between paragraphs
        elif kind == "L":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []

            # "Lbl" leaves carry the bullet/number, their siblings the item text.
            differ_creator = element["Path"].split("/")
            if differ_creator[-1] == "Lbl":
                current_element = ["List Numbering", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]
            else:
                current_element = ["List Data", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"], element["Font"]["weight"]]

            curr_page_contents.append(current_element)

        # Detection of figures
        elif kind == "F":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []

            current_element = ["Figure", element["filePaths"][0], element["attributes"]["Placement"],
                               element["attributes"]["BBox"][0], element["attributes"]["BBox"][1],
                               element["attributes"]["BBox"][2], element["attributes"]["BBox"][3]]

            curr_page_contents.append(current_element)

        # Detection of tables
        elif kind == "S":
            if current_page != element["Page"]:
                all_pages_data.append(curr_page_contents)
                current_page += 1
                curr_page_contents = []

            # Only keep "Sect/Table" elements, whose renditions carry the xlsx path.
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table", element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])

    all_pages_data.append(curr_page_contents)
    # Closing file
    f.close()
    return all_pages_data

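# json_parser returns a list of pages, each a list of typed entries that
# word_creator below consumes (shapes as built above):
#   ["Text", text, size, font_family, italic, weight, space_after, text_align]
#   ["List Numbering" | "List Data", text, size, font_family, italic, weight]
#   ["Figure", file_path, placement, x0, y0, x1, y1]
#   ["Table", num_rows, num_cols, xlsx_path]
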
def word_creator(all_data, doc_name, media_path_ref, src_lang, tar_lang, header_footer_present, headers, footers):
    # Directory holding the extracted zip contents (figure and table renditions).
    media_path_dir = "/".join(media_path_ref.split("/")[:-1]) + "/"

    listo = ""

    doc = Document()
    file = open("/home/user/mnf/project/MNF/translation_data.txt", "w")
    for count, page in enumerate(all_data):
        print("headers are:", headers)
        if header_footer_present == "header" or header_footer_present == "both":
            try:
                header = doc.sections[count].header
                header_text = header.paragraphs[0].add_run(str(headers[count]))
                header_text.font.size = Pt(12)
                header_text.font.bold = True
            except Exception as e:
                print("Adding Header has the below error: ", e)

        for ele in page:
            print("Current Element", ele)

            # Writing text in docx
            if ele[0] == "Text":
                # Mutates the document-wide Normal style to mirror the source font.
                style = doc.styles['Normal']
                font = style.font
                font.name = str(ele[3])
                font.size = Pt(int(ele[2]))
                act = doc.add_paragraph(style=style)
                act_format = act.paragraph_format
                act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                if ele[7] == "Start":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.LEFT
                elif ele[7] == "Center":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
                elif ele[7] == "End":
                    act_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                else:
                    # "", "Justify" and anything unrecognised fall back to justified.
                    act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                # if (non_dial_dest_lang == 'hi') or (non_dial_dest_lang == 'gu'):
                #     act.style.font.name = 'Mangal'
                # else:
                #     act.style.font.name = 'Courier New'
                trans_text, trans = translate_comparison(ele[1], src_lang, tar_lang, True)
                file.write(str(ele[1]))
                file.write(str(trans))
                para = act.add_run(trans_text)
                if ele[4]:  # Font.italic is a JSON boolean
                    para.italic = True
                if ele[5] > 400:  # font weight above regular reads as bold
                    para.bold = True

            # Adding a table in docx
            elif ele[0] == "Table":
                # Read the xlsx rendition produced by the Extract API.
                hr_df = pd.read_excel(media_path_dir + str(ele[3]))

                # Widen the current section so the table fits.
                section = doc.sections[-1]
                section.left_margin = Mm(5)
                section.right_margin = Mm(5)

                add_table_to_doc(doc, hr_df.iloc[:5])

            # A "List Numbering" entry decides ordered vs unordered for the next item
            elif ele[0] == "List Numbering":
                if list(ele[1])[0].isdigit():
                    listo = "Ordered"
                else:
                    listo = "UnOrdered"

            # Adding a list item in docx
            elif ele[0] == "List Data":
                trans_text, trans = translate_comparison(ele[1], src_lang, tar_lang, True)
                if listo == "Ordered":
                    para = doc.add_paragraph(style='List Number')
                else:
                    para = doc.add_paragraph(style='List Bullet')
                listo = ""
                run = para.add_run(trans_text)
                if ele[4]:
                    run.italic = True
                if ele[5] > 300:
                    run.bold = True

            # Adding a figure in docx, sized from its bounding box
            elif ele[0] == "Figure":
                doc.add_picture(media_path_dir + "/contents/" + str(ele[1]),
                                width=Pt(int(ele[5]) - int(ele[3])),
                                height=Pt(int(ele[6]) - int(ele[4])))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        print("footers are:", footers)
        if header_footer_present == "footer" or header_footer_present == "both":
            try:
                footer = doc.sections[count].footer
                footer_text = footer.paragraphs[0].add_run(str(footers[count]))
                footer_text.font.size = Pt(12)
                footer_text.font.bold = True
            except Exception as e:
                print("Adding Footer has the below error: ", e)

        # One section per source page so headers/footers can differ per page.
        doc.add_page_break()
        doc.add_section(WD_SECTION.NEW_PAGE)

    file.close()

    # Save to file
    print("Saving the Doc")
    print(doc_name)
    print(doc.sections)
    for count, section in enumerate(doc.sections):
        print(count)
    doc.save(doc_name)

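# Sketch of a word_creator call (illustrative values; translate_comparison must
# be reachable and the media path must contain the extracted renditions):
#
#   word_creator(all_pages_data, "/tmp/book.docx", "/tmp/out/book.docx",
#                "en", "hi", "both", headers, footers)
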
def convert_books(inputfile, src_lang, tar_lang, greater_than_10, header_footer_present):
    outputzipname_pre = str(inputfile).split(".")[0] + ".zip"
    outputzipname = outputzipname_pre.replace("/book/", "/book/zips/")

    print("Extracting header and footer of every page if present")
    headerFlag = header_footer_present in ("header", "both")
    footerFlag = header_footer_present in ("footer", "both")
    headers, footers = extract_header_footer(inputfile, headerFlag, footerFlag)
    print("found headers and footers", headers, footers)

    if greater_than_10:
        pdf = PyPDF2.PdfFileReader(inputfile)
        numpages = pdf.getNumPages()

        # Split the book into 10-page chunks: (0, 10), (10, 20), ... plus a
        # final partial chunk when numpages is not a multiple of 10.
        intermediate_pages = []
        hard_pages = numpages // 10
        for i in range(hard_pages):
            intermediate_pages.append((10 * i, 10 * (i + 1)))
        final_pages = numpages - 10 * hard_pages
        if final_pages != 0:
            intermediate_pages.append((10 * hard_pages, 10 * hard_pages + final_pages))
        print(intermediate_pages)

        book_name = str(inputfile).split('/')[-1].split('.')[0]
        chunk_dir = f"{basepath}/media/scripts/book/intermediate_files/{book_name}/"
        for page_start, page_end in intermediate_pages:
            pdf_writer1 = PyPDF2.PdfFileWriter()
            for page in range(page_start, page_end):
                pdf_writer1.addPage(pdf.getPage(page))
            if not os.path.exists(chunk_dir):
                os.mkdir(chunk_dir, mode=0o777)
            with open(f"{chunk_dir}{page_start}_{page_end}.pdf", "wb") as file1:
                pdf_writer1.write(file1)

        # Making a docx for each 10-page pdf
        for page_start, page_end in intermediate_pages:
            pdf_file = f"{chunk_dir}{page_start}_{page_end}.pdf"
            print("Current File -> ", pdf_file)
            loc = pdf_text_images_extractor(api_creds, pdf_file,
                                            f"{chunk_dir}{page_start}_{page_end}.zip")

            print("Extracting all contents of zip")
            zip_extractor(loc)

            print("Parsing the Json File and getting all the details")
            json_dir = "/".join(loc.split("/")[:-1]) + "/"
            json_file_path = json_dir + "contents/structuredData.json"
            all_pages_data = json_parser(json_file_path)

            print("Creating the Doc")
            print(basePath + f"/{json_dir.split('/')[-2]}.docx")
            word_creator(all_pages_data, basePath + f"/{json_dir.split('/')[-2]}.docx",
                         json_dir + f"/{json_dir.split('/')[-2]}.docx",
                         src_lang, tar_lang, header_footer_present, headers, footers)

            # Deleting the directory of zip contents
            print("Deleting the Directory")
            print(json_dir)
            # shutil.rmtree(json_dir, ignore_errors=True)
            # return str(basePath + f"/{json_dir.split('/')[-2]}.docx")

        # TODO: combine all the per-chunk docx files into one docx.
        pass
    else:
        print("Extracted All the texts and images from PDF-API CALL ")
        loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)

        print("Extracting all contents of zip")
        zip_extractor(loc)

        print("Parsing the Json File and getting all the details")
        json_dir = "/".join(loc.split("/")[:-1]) + "/"
        json_file_path = json_dir + "contents/structuredData.json"
        all_pages_data = json_parser(json_file_path)

        print("Creating the Doc")
        print(basePath + f"/{json_dir.split('/')[-2]}.docx")
        word_creator(all_pages_data, basePath + f"/{json_dir.split('/')[-2]}.docx",
                     json_dir + f"/{json_dir.split('/')[-2]}.docx",
                     src_lang, tar_lang, header_footer_present, headers, footers)

        # Deleting the directory of zip contents
        print("Deleting the Directory")
        print(json_dir)
        shutil.rmtree(json_dir, ignore_errors=True)

        return str(basePath + f"/{json_dir.split('/')[-2]}.docx")

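# Sketch of the end-to-end call (illustrative language codes; the input must
# live under a ".../book/" directory so the zip-path rewrite above applies):
#
#   docx = convert_books("/path/to/book/sample.pdf", "en", "hi",
#                        greater_than_10=False, header_footer_present="both")
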

#
# def convert_books_grt10pages(inputfile, src_lang, tar_lang):
#     outputzipname_pre = str(inputfile).split(".")[0] + ".zip"
#     outputzipname = outputzipname_pre.replace("/book/", "/book/zips/")
#
#     print("Extracted All the texts and images from PDF-API CALL ")
#     loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)

# Local test inputs used during development:
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
# outputzipname = "someoutput2.zip"
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"

# convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
# pdf_text_images_extractor(api_creds, "/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf", "output.zip")
# zip_extractor(outputzipname)
# all_pages_data = json_parser(json_file)
# print(all_pages_data)
# word_creator(all_pages_data)
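
# Minimal driver, guarded so importing this module stays side-effect free.
# The path reuses the dev example above; the language codes and the "none"
# header/footer flag are illustrative assumptions, not values from the source.
if __name__ == "__main__":
    result_docx = convert_books(
        "/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf",
        "en", "hi", greater_than_10=False, header_footer_present="none")
    print("Created:", result_docx)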