# Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator/newConvertBook.py
# (header reconstructed from web-viewer scrape residue: 565 lines, 22 KiB,
#  Python, last revision 2024-04-27 09:33:09 +00:00)
#for extraction of text and images from pdf
import logging
import os.path
from adobe.pdfservices.operation.auth.credentials import Credentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
ExtractRenditionsElementType
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.execution_context import ExecutionContext
from adobe.pdfservices.operation.io.file_ref import FileRef
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
#for zip extraction
from zipfile import ZipFile
#for parsing json
import json
#for adding tables in docx
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx import Document
import pandas as pd
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION
#Delete Intermediate Files
import shutil
# For Translation
from conversion.translation.translation_function import translate_comparison
# For headers and footers
import PyPDF2
basePath = "/home/user/mnf/project/MNF/media/scripts/book/translated"
basepath = "/home/user/mnf/project/MNF"
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
def extract_header_footer(pdf_path, headerFlag, footerFlag):
    """Extract the first (header) and last (footer) text line of every page.

    Args:
        pdf_path: path to the source PDF file.
        headerFlag: when True, collect the first text line of each page.
        footerFlag: when True, collect the last text line of each page.

    Returns:
        (header_text, footer_text): two lists with one entry per page.
        Pages with no extractable text contribute "" so the lists stay
        aligned with page indices (word_creator indexes them by page count).
    """
    header_text = []
    footer_text = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfFileReader(file)
        for page in reader.pages:
            if not (headerFlag or footerFlag):
                continue
            # Extract once per page (the original called extractText() twice
            # when both flags were set). NOTE(review): extractText()/
            # PdfFileReader are the legacy PyPDF2 API — confirm installed
            # PyPDF2 version still provides them before upgrading.
            lines = page.extractText().splitlines()
            if headerFlag:
                # Guard against empty pages: the original indexed [0]
                # unconditionally and raised IndexError on blank pages.
                header_text.append(lines[0] if lines else "")
            if footerFlag:
                footer_text.append(lines[-1] if lines else "")
    return header_text, footer_text
def set_cell_margins(cell, **kwargs):
    """Set per-side margins on a python-docx table cell.

    Accepted keyword arguments: top, start, bottom, end — each an integer
    margin width in dxa (twentieths of a point). Sides not supplied are
    left untouched.
    """
    margins = OxmlElement('w:tcMar')
    for side in ("top", "start", "bottom", "end"):
        if side not in kwargs:
            continue
        entry = OxmlElement("w:{}".format(side))
        entry.set(qn('w:w'), str(kwargs.get(side)))
        entry.set(qn('w:type'), 'dxa')
        margins.append(entry)
    # Attach the assembled margin element to the cell's properties node.
    cell._tc.get_or_add_tcPr().append(margins)
def add_table_to_doc(doc, df):
    """Append a grid-styled table built from DataFrame *df* to *doc*.

    The first row holds the capitalized column names; one table row is added
    per DataFrame row. The Excel artifact " _x000D_" (carriage-return escape)
    is stripped from every cell. Returns *doc* for convenience.
    """
    headers = list(df.columns)
    n_cols = len(headers)
    table = doc.add_table(rows=1, cols=n_cols, style="Table Grid")
    table.autofit = True
    # Header row.
    for idx, name in enumerate(headers):
        cell = table.cell(0, idx)
        set_cell_margins(cell, top=100, start=100, bottom=100, end=50)
        cell.text = name.replace(" _x000D_", "").capitalize()
    # Data rows; itertuples() puts the index at position 0, hence idx + 1.
    for record in df.itertuples():
        cells = table.add_row().cells
        for idx in range(n_cols):
            set_cell_margins(cells[idx], top=100, start=100, bottom=100, end=50)
            cells[idx].text = str(record[idx + 1]).replace(" _x000D_", "")
    return doc
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe PDF Services Extract operation on *inputFile*.

    Extracts text and tables (plus table/figure renditions) and saves the
    result zip under a directory named after *outputzip*'s stem:
    <dir-of-outputzip>/<stem>/<stem>.zip

    Args:
        api_creds: path to the pdfservices-api-credentials.json file.
        inputFile: path of the PDF to process.
        outputzip: requested output zip path; only its directory and stem
            are used to build the final location.

    Returns:
        The path of the written zip file, or None when the Adobe service
        raises (the exception is logged, matching the original behavior).
    """
    try:
        # Build credentials and an execution context for the Extract API.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()
        extract_pdf_operation.set_input(FileRef.create_from_local_file(inputFile))
        # Request text + tables, with renditions for tables and figures.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)
        result: FileRef = extract_pdf_operation.execute(execution_context)
        print("Saving the Zip")
        # os.path replaces the original manual "/"-split reconstruction.
        # Keep split(".")[0] (not splitext) to preserve stems for names
        # containing multiple dots, exactly as before.
        out_dir = os.path.dirname(outputzip)
        stem = os.path.basename(outputzip).split(".")[0]
        target_dir = os.path.join(out_dir, stem)
        # exist_ok=True removes the exists()-then-makedirs race of the original.
        os.makedirs(target_dir, mode=0o777, exist_ok=True)
        target_zip = os.path.join(target_dir, stem + ".zip")
        # Context manager guarantees the handle closes even if the write raises.
        with open(target_zip, "wb") as outputfile:
            result.write_to_stream(outputfile)
        return target_zip
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
        return None
def zip_extractor(filename):
    """Extract every member of zip *filename* into a sibling 'contents' dir.

    e.g. /a/b/x.zip -> members extracted under /a/b/contents/.
    Uses os.path.dirname instead of the original manual "/"-split rebuild.
    """
    target_dir = os.path.join(os.path.dirname(filename), "contents")
    with ZipFile(filename, 'r') as zipObj:
        zipObj.extractall(target_dir)
def json_parser(filename):
    """Parse an Adobe Extract structuredData.json into per-page element lists.

    Each element's 'Path' looks like "//Document/<Tag>..."; the character at
    index 11 (the first character after "//Document/") selects the kind:
    H/P -> text, L -> list, F -> figure, S -> section (possibly a table).

    Args:
        filename: path to structuredData.json (UTF-8).

    Returns:
        A list with one entry per page; each entry is a list of elements:
        ["Text", text, size, font, italic, weight, space_after, align],
        ["List Numbering"|"List Data", text, size, font, italic, weight],
        ["Figure", file_path, placement, x0, y0, x1, y1], or
        ["Table", num_rows, num_cols, file_path].
    """
    # Context manager replaces the original open()/close() pair.
    with open(filename, encoding="utf8") as f:
        data = json.load(f)
    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])
    all_pages_data = []
    curr_page_contents = []
    current_page = 0
    for element in data['elements']:
        print("1789")
        print(element)
        path = element['Path']
        # Guard: the original list(path)[11] raised IndexError on short paths.
        tag = path[11] if len(path) > 11 else ""
        if tag not in ("H", "P", "L", "F", "S"):
            continue
        # Page rollover, hoisted out of the four branches that repeated it.
        # NOTE(review): increments by exactly one per rollover, so a page
        # with no elements would misalign subsequent pages — preserved from
        # the original; confirm the Extract API never skips page numbers.
        if current_page != element["Page"]:
            all_pages_data.append(curr_page_contents)
            current_page += 1
            curr_page_contents = []
        if tag in ("H", "P"):
            # Skip ill-detected paragraphs that lack the text/font fields.
            try:
                current_element = ["Text", element["Text"], element["TextSize"],
                                   element["Font"]["family_name"],
                                   element["Font"]["italic"],
                                   element["Font"]["weight"]]
            except KeyError:
                continue
            # Optional attributes default to "" (same as the original's
            # bare-except fallbacks, but without swallowing other errors).
            attrs = element.get("attributes", {})
            current_element.append(attrs.get("SpaceAfter", ""))
            current_element.append(attrs.get("TextAlign", ""))
            curr_page_contents.append(current_element)
        elif tag == "L":
            # A trailing "Lbl" path component marks the list marker itself.
            kind = "List Numbering" if path.split("/")[-1] == "Lbl" else "List Data"
            curr_page_contents.append([kind, element["Text"], element["TextSize"],
                                       element["Font"]["family_name"],
                                       element["Font"]["italic"],
                                       element["Font"]["weight"]])
        elif tag == "F":
            bbox = element["attributes"]["BBox"]
            curr_page_contents.append(["Figure", element["filePaths"][0],
                                       element["attributes"]["Placement"],
                                       bbox[0], bbox[1], bbox[2], bbox[3]])
        elif tag == "S":
            # BUG FIX: the original compared list(path)[11:21] (a list of
            # characters) against the string "Sect/Table" — never equal, so
            # tables were silently dropped. Compare the substring instead.
            if path[11:21] == "Sect/Table":
                curr_page_contents.append(["Table",
                                           element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])
    # Flush the final page.
    all_pages_data.append(curr_page_contents)
    return all_pages_data
def word_creator(all_data, doc_name, media_path_ref, src_lang, tar_lang,
                 header_footer_present, headers, footers):
    """Build a translated .docx from parsed per-page element data.

    Args:
        all_data: json_parser() output — one list of elements per page.
        doc_name: destination path for the saved .docx.
        media_path_ref: reference path whose directory holds the extracted
            zip contents (table .xlsx files, figure images).
        src_lang / tar_lang: translation source / target language codes.
        header_footer_present: "header", "footer", "both", or anything else
            for neither.
        headers / footers: per-page strings from extract_header_footer().

    Side effects: writes original+translated text pairs to
    translation_data.txt (hardcoded path) and saves the document.
    """
    # Directory that contains the extracted media, with trailing "/".
    media_path_dir = "/".join(media_path_ref.split("/")[:-1]) + "/"
    listo = ""
    doc = Document()
    # Context manager fixes the original's never-closed file handle.
    with open("/home/user/mnf/project/MNF/translation_data.txt", "w") as file:
        for count, page in enumerate(all_data):
            print("headers are:", headers)
            if header_footer_present in ("header", "both"):
                try:
                    header = doc.sections[count].header
                    header_text = header.paragraphs[0].add_run(str(headers[count]))
                    header_text.font.size = Pt(12)
                    header_text.font.bold = True
                except Exception as e:
                    print("Adding Header has the below error: ", e)
            for ele in page:
                print("Current Element", ele)
                if ele[0] == "Text":
                    # NOTE(review): mutating the shared 'Normal' style per
                    # element is preserved from the original — it affects
                    # every paragraph using that style, not just this one.
                    style = doc.styles['Normal']
                    font = style.font
                    font.name = str(ele[3])
                    font.size = Pt(int(ele[2]))
                    act = doc.add_paragraph(style=style)
                    act_format = act.paragraph_format
                    act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                    # Map Adobe's TextAlign values; anything unknown (or "")
                    # falls back to justified, as before.
                    alignment_map = {
                        "Justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
                        "Start": WD_ALIGN_PARAGRAPH.LEFT,
                        "Center": WD_ALIGN_PARAGRAPH.CENTER,
                        "End": WD_ALIGN_PARAGRAPH.RIGHT,
                    }
                    act_format.alignment = alignment_map.get(
                        ele[7], WD_ALIGN_PARAGRAPH.JUSTIFY)
                    act_format.line_spacing = Pt(12)
                    act_format.left_indent = Inches(0)
                    trans_text, trans = translate_comparison(ele[1], src_lang,
                                                             tar_lang, True)
                    file.write(str(ele[1]))
                    file.write(str(trans))
                    para = act.add_run(trans_text)
                    if ele[4] == "true":
                        para.italic = True
                    if ele[5] > 400:  # font weight above regular -> bold
                        para.bold = True
                elif ele[0] == "Table":
                    hr_df = pd.read_excel(media_path_dir + str(ele[3]))
                    # BUG FIX: the original re-created Document() here,
                    # silently discarding everything already written. Adjust
                    # margins on the document being built instead.
                    section = doc.sections[-1]
                    section.left_margin = Mm(5)
                    section.right_margin = Mm(5)
                    add_table_to_doc(doc, hr_df.iloc[:5])
                elif ele[0] == "List Numbering":
                    # Remember whether the upcoming list item is ordered.
                    listo = "Ordered" if ele[1][0].isdigit() else "UnOrdered"
                elif ele[0] == "List Data":
                    list_style = 'List Number' if listo == "Ordered" else 'List Bullet'
                    para = doc.add_paragraph(
                        translate_comparison(ele[1], src_lang, tar_lang, True),
                        style=list_style)
                    listo = ""
                    if ele[4] == "true":
                        para.italic = True
                    if ele[5] > 300:
                        para.bold = True
                elif ele[0] == "Figure":
                    # BBox order is (x0, y0, x1, y1); width/height in points.
                    doc.add_picture(media_path_dir + "/contents/" + str(ele[1]),
                                    width=Pt(int(ele[5]) - int(ele[3])),
                                    height=Pt(int(ele[6]) - int(ele[4])))
                    doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
            print("footers are:", footers)
            if header_footer_present in ("footer", "both"):
                try:
                    footer = doc.sections[count].footer
                    # BUG FIX: the original wrote str(footer[count]) —
                    # indexing the footer object — so every footer failed
                    # into the except branch. Use the footers list.
                    footer_text = footer.paragraphs[0].add_run(str(footers[count]))
                    footer_text.font.size = Pt(12)
                    footer_text.font.bold = True
                except Exception as e:
                    print("Adding Footer has the below error: ", e)
            # New page + new section per source page so each page can carry
            # its own header/footer.
            doc.add_page_break()
            doc.add_section(WD_SECTION.NEW_PAGE)
    print("Saving the Doc")
    print(doc_name)
    print(doc.sections)
    for count, section in enumerate(doc.sections):
        print(count)
    doc.save(doc_name)
def convert_books(inputfile, src_lang, tar_lang, greater_than_10, header_footer_present):
    """Translate a PDF book into a .docx via the Adobe Extract pipeline.

    For PDFs flagged greater_than_10, the file is split into 10-page chunks
    (API page limit workaround), each chunk is extracted and rendered to its
    own .docx; combining them is still TODO and that branch returns None.
    Otherwise the whole PDF goes through one extract/parse/word_creator pass
    and the resulting .docx path is returned.

    Args:
        inputfile: path to the source PDF under media/scripts/book/.
        src_lang / tar_lang: translation language codes.
        greater_than_10: True when the PDF exceeds the 10-page API limit.
        header_footer_present: "header" | "footer" | "both" | other.
    """
    outputzipname = (str(inputfile).split(".")[0] + ".zip").replace("/book/", "/book/zips/")
    print("Extracting header and footer of every page if present")
    headerFlag = header_footer_present in ("header", "both")
    footerFlag = header_footer_present in ("footer", "both")
    headers, footers = extract_header_footer(inputfile, headerFlag, footerFlag)
    print("founded headers and footers", headers, footers)
    # Filename stem, hoisted out of the original's repeated split chains.
    stem = str(inputfile).split('/')[-1].split('.')[0]
    if greater_than_10:
        pdf = PyPDF2.PdfFileReader(inputfile, "rb")
        numpages = pdf.getNumPages()
        intermediate_pages = []
        hard_pages = numpages // 10
        for i in range(hard_pages):
            intermediate_pages.append((10 * i, 10 * (i + 1)))
        final_pages = numpages - 10 * hard_pages
        # BUG FIX: only add the trailing chunk when pages remain — the
        # original appended an empty (n, n) range when numpages was a
        # multiple of 10, producing an empty PDF and a wasted API call.
        if final_pages > 0:
            intermediate_pages.append((10 * hard_pages, 10 * hard_pages + final_pages))
        print("counter 1234")
        print(intermediate_pages)
        inter_dir = f"{basepath}/media/scripts/book/intermediate_files/{stem}/"
        # Write each 10-page slice to its own intermediate PDF.
        for page_start, page_end in intermediate_pages:
            pdf_writer1 = PyPDF2.PdfFileWriter()
            for page in range(page_start, page_end):
                pdf_writer1.addPage(pdf.getPage(page))
            # exist_ok=True removes the exists()-then-mkdir race.
            os.makedirs(inter_dir, mode=0o777, exist_ok=True)
            with open(f"{inter_dir}{page_start}_{page_end}.pdf", "wb") as file1:
                pdf_writer1.write(file1)
        # Extract + translate each chunk into its own .docx.
        for page_start, page_end in intermediate_pages:
            pdf_file = f"{inter_dir}{page_start}_{page_end}.pdf"
            print("Current File -> ", pdf_file)
            loc = pdf_text_images_extractor(api_creds, pdf_file,
                                            f"{inter_dir}{page_start}_{page_end}.zip")
            print("Extracting all contents of zip")
            zip_extractor(loc)
            print("Parsing the Json File and getting all the details")
            json_dir = "/".join(loc.split("/")[:-1]) + "/"
            json_file_path = json_dir + "/contents/structuredData.json"
            all_pages_data = json_parser(json_file_path)
            print("Creating the Doc")
            print(basePath + f"/{json_dir.split('/')[-2]}.docx")
            word_creator(all_pages_data,
                         basePath + f"/{json_dir.split('/')[-2]}.docx",
                         json_dir + f"/{json_dir.split('/')[-2]}.docx",
                         src_lang, tar_lang, header_footer_present,
                         headers, footers)
            print("Deleting the Directory")
            print(json_dir)
            # Intermediate cleanup intentionally disabled (as in original):
            # shutil.rmtree(json_dir, ignore_errors=True)
        # TODO: combine the per-chunk .docx files into one document; until
        # then this branch (like the original) returns None.
        pass
    else:
        print("Extracted All the texts and images from PDF-API CALL ")
        loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)
        print("Extracting all contents of zip")
        zip_extractor(loc)
        print("Parsing the Json File and getting all the details")
        json_dir = "/".join(loc.split("/")[:-1]) + "/"
        json_file_path = json_dir + "/contents/structuredData.json"
        all_pages_data = json_parser(json_file_path)
        print("Creating the Doc")
        print(basePath + f"/{json_dir.split('/')[-2]}.docx")
        word_creator(all_pages_data,
                     basePath + f"/{json_dir.split('/')[-2]}.docx",
                     json_dir + f"/{json_dir.split('/')[-2]}.docx",
                     src_lang, tar_lang, header_footer_present,
                     headers, footers)
        print("Deleting the Directory")
        print(json_dir)
        shutil.rmtree(json_dir, ignore_errors=True)
        return str(basePath + f"/{json_dir.split('/')[-2]}.docx")
#
# def convert_books_grt10pages(inputfile, src_lang, tar_lang):
# outputzipname_pre = str((str(inputfile).split("."))[0]) + ".zip"
# outputzipname = outputzipname_pre.replace("/book/", "/book/zips/")
#
# print("Extracted All the texts and images from PDF-API CALL ")
# loc = pdf_text_images_extractor(api_creds, inputfile, outputzipname)
#
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
# outputzipname = "someoutput2.zip"
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"
#convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
#pdf_text_images_extractor(api_creds,"/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf","output.zip")
#zip_extractor(outputzipname)
# all_pages_data = json_parser(json_file)
# #print(all_pages_data)
# word_creator(all_pages_data)