#!/usr/bin/env python3
# Book translator: extract text, tables and figures from a PDF via the
# Adobe PDF Services Extract API and rebuild the content as a .docx file.
#for extraction of text and images from pdf
|
|
import logging
|
|
import os.path
|
|
from adobe.pdfservices.operation.auth.credentials import Credentials
|
|
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_pdf_options import ExtractPDFOptions
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_renditions_element_type import \
|
|
ExtractRenditionsElementType
|
|
from adobe.pdfservices.operation.pdfops.options.extractpdf.extract_element_type import ExtractElementType
|
|
from adobe.pdfservices.operation.execution_context import ExecutionContext
|
|
from adobe.pdfservices.operation.io.file_ref import FileRef
|
|
from adobe.pdfservices.operation.pdfops.extract_pdf_operation import ExtractPDFOperation
|
|
|
|
#for zip extraction
|
|
from zipfile import ZipFile
|
|
|
|
#for parsing json
|
|
import json
|
|
|
|
|
|
#for adding tables in docx
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx import Document
|
|
import pandas as pd
|
|
from docx.shared import Mm
|
|
from docx.shared import Inches, Cm, Pt
|
|
from docx.oxml.shared import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
|
|
|
|
# Service-account credentials for the Adobe PDF Services Extract API.
api_creds = "/home/user/mnf/project/MNF/conversion/booktranslator/api_creds/pdfservices-api-credentials.json"

# Log level comes from the LOGLEVEL environment variable; INFO by default.
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
|
|
|
|
|
|
|
|
def set_cell_margins(cell, **kwargs):
    """Set the internal margins of a docx table cell.

    Keyword arguments ``top``, ``start``, ``bottom`` and ``end`` give the
    margin width in dxa (twentieths of a point); sides not supplied are
    left untouched.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    margins = OxmlElement('w:tcMar')

    for side in ("top", "start", "bottom", "end"):
        if side not in kwargs:
            continue
        node = OxmlElement("w:{}".format(side))
        node.set(qn('w:w'), str(kwargs.get(side)))
        node.set(qn('w:type'), 'dxa')
        margins.append(node)

    cell_props.append(margins)
|
|
|
|
|
|
|
|
def add_table_to_doc(doc, df):
    """Append dataframe *df* to *doc* as a grid-style docx table.

    The first table row holds the capitalised column names.  Every cell is
    stripped of the " _x000D_" carriage-return artefact left by the Excel
    export and given uniform internal margins.  Returns *doc*.
    """
    headers = list(df.columns)

    table = doc.add_table(rows=1, cols=len(headers), style="Table Grid")
    table.autofit = True

    # Header row.
    for idx in range(len(headers)):
        header_cell = table.cell(0, idx)
        set_cell_margins(header_cell, top=100, start=100, bottom=100, end=50)
        header_cell.text = headers[idx].replace(" _x000D_", "").capitalize()

    # One docx row per dataframe row; itertuples() puts the index at [0],
    # so column values start at row[1].
    for i, row in enumerate(df.itertuples()):
        cells = table.add_row().cells
        for idx in range(len(headers)):
            set_cell_margins(cells[idx], top=100, start=100, bottom=100, end=50)
            cells[idx].text = str(row[idx + 1]).replace(" _x000D_", "")

    return doc
|
|
|
|
|
|
|
|
def pdf_text_images_extractor(api_creds, inputFile, outputzip):
    """Run the Adobe PDF Services Extract operation on *inputFile*.

    Extracts text and tables (plus renditions of tables and figures) and
    saves the resulting archive to *outputzip*.  Service/SDK errors are
    logged rather than raised, so a failed extraction does not abort the
    caller.
    """
    try:
        # Initial setup: build a credentials instance from the JSON file.
        credentials = Credentials.service_account_credentials_builder() \
            .from_file(api_creds) \
            .build()

        # Create an ExecutionContext using credentials and a new operation instance.
        execution_context = ExecutionContext.create(credentials)
        extract_pdf_operation = ExtractPDFOperation.create_new()

        # Set operation input from a source file.
        source = FileRef.create_from_local_file(inputFile)
        extract_pdf_operation.set_input(source)

        # Build ExtractPDF options and set them into the operation.
        extract_pdf_options: ExtractPDFOptions = ExtractPDFOptions.builder() \
            .with_elements_to_extract([ExtractElementType.TEXT, ExtractElementType.TABLES]) \
            .with_elements_to_extract_renditions([ExtractRenditionsElementType.TABLES,
                                                  ExtractRenditionsElementType.FIGURES]) \
            .build()
        extract_pdf_operation.set_options(extract_pdf_options)

        # Execute the operation.
        result: FileRef = extract_pdf_operation.execute(execution_context)

        # BUG FIX: the result used to be saved to the hard-coded "/tmp/extra",
        # ignoring the outputzip argument every caller passes in.
        result.save_as(outputzip)
    except (ServiceApiException, ServiceUsageException, SdkException):
        logging.exception("Exception encountered while executing operation")
|
|
|
|
|
|
|
|
def zip_extractor(filename, dest="contents"):
    """Extract every member of the zip archive *filename* into *dest*.

    *dest* defaults to the "contents" directory the rest of the pipeline
    reads from (backward compatible); the directory is created if missing.
    """
    with ZipFile(filename, 'r') as zipObj:
        # Extract all the contents of the zip file into the target directory.
        zipObj.extractall(dest)
|
|
|
|
|
|
|
|
def json_parser(filename):
    """Parse an Adobe Extract structuredData JSON file into per-page lists.

    Returns a list with one entry per page; each entry is a list of
    elements.  An element is a list whose first item is a kind tag
    ("Text", "List Numbering", "List Data", "Figure" or "Table") followed
    by kind-specific fields consumed by word_creator().
    """
    with open(filename, encoding="utf8") as f:
        data = json.load(f)

    print(data['extended_metadata']['page_count'])
    print(data['extended_metadata']['language'])

    all_pages_data = []
    curr_page_contents = []
    current_page = 0

    for element in data['elements']:
        # Character 11 of the Path ("//Document/X...") is the tag initial:
        # H(eading), P(aragraph), L(ist), F(igure), S(ect — contains tables).
        # Slicing (instead of list indexing) is safe for short paths.
        tag = element['Path'][11:12]
        if tag not in ("H", "P", "L", "F", "S"):
            continue

        # Flush the page buffer whenever the page number moves on.
        if current_page != element["Page"]:
            all_pages_data.append(curr_page_contents)
            curr_page_contents = []
            # BUG FIX: was `current_page += 1`, which drifted out of sync
            # whenever content skipped more than one page.
            current_page = element["Page"]

        # Headings and paragraphs.
        if tag in ("H", "P"):
            current_element = ["Text", element["Text"], element["TextSize"],
                              element["Font"]["family_name"],
                              element["Font"]["italic"], element["Font"]["weight"]]
            # Optional layout attributes; "" marks "not present in the PDF".
            for attr in ("SpaceAfter", "TextAlign"):
                try:
                    current_element.append(element["attributes"][attr])
                except KeyError:
                    current_element.append("")
            curr_page_contents.append(current_element)

        # Lists: ".../Lbl" entries carry the bullet/number marker, the
        # remaining entries carry the item text.
        elif tag == "L":
            if element["Path"].split("/")[-1] == "Lbl":
                kind = "List Numbering"
            else:
                kind = "List Data"
            curr_page_contents.append([kind, element["Text"], element["TextSize"],
                                       element["Font"]["family_name"],
                                       element["Font"]["italic"],
                                       element["Font"]["weight"]])

        # Figures: file path, placement and bounding box (x0, y0, x1, y1).
        elif tag == "F":
            bbox = element["attributes"]["BBox"]
            curr_page_contents.append(["Figure", element["filePaths"][0],
                                       element["attributes"]["Placement"],
                                       bbox[0], bbox[1], bbox[2], bbox[3]])

        # Tables (nested under a Sect node).
        elif tag == "S":
            # BUG FIX: the original compared a *list* of characters
            # (list(path)[11:21]) with the string "Sect/Table" — never
            # equal, so every table was silently dropped.
            if element['Path'][11:21] == "Sect/Table":
                curr_page_contents.append(["Table",
                                           element["attributes"]["NumRow"],
                                           element["attributes"]["NumCol"],
                                           element["filePaths"][0]])

    # Flush the final page.
    all_pages_data.append(curr_page_contents)
    return all_pages_data
|
|
|
|
|
|
|
|
def word_creator(all_data, output_path="hr_data11111111.docx"):
    """Rebuild a .docx document from the per-page element lists of json_parser.

    *all_data* is a list of pages, each a list of elements tagged "Text",
    "Table", "List Numbering", "List Data" or "Figure".  The result is
    saved to *output_path* (default kept for backward compatibility).
    """
    listo = ""
    doc = Document()

    for page in all_data:
        for ele in page:

            # -- plain text / headings ------------------------------------
            if ele[0] == "Text":
                style = doc.styles['Normal']
                font = style.font
                font.name = str(ele[3])
                font.size = Pt(int(ele[2]))
                act = doc.add_paragraph(style=style)
                act_format = act.paragraph_format
                # ele[6] is SpaceAfter; "" means the PDF carried none.
                act_format.space_after = Pt(12) if ele[6] == "" else Pt(int(ele[6]))
                # ele[7] is TextAlign; unknown/absent values justify.
                alignments = {
                    "Justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
                    "Start": WD_ALIGN_PARAGRAPH.LEFT,
                    "Center": WD_ALIGN_PARAGRAPH.CENTER,
                    "End": WD_ALIGN_PARAGRAPH.RIGHT,
                }
                act_format.alignment = alignments.get(ele[7], WD_ALIGN_PARAGRAPH.JUSTIFY)
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                para = act.add_run(ele[1])
                # BUG FIX: json.load yields booleans for Font.italic, so the
                # old comparison with the string "true" never matched.
                if ele[4] is True or ele[4] == "true":
                    para.italic = True
                if ele[5] > 400:
                    para.bold = True

            # -- tables ---------------------------------------------------
            elif ele[0] == "Table":
                # TODO(review): should this read the export recorded in
                # ele[3] instead of a fixed sample file?  Confirm the layout
                # of the extracted "contents" directory first.
                hr_df = pd.read_excel('C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents_table\\tables\\fileoutpart0.xlsx')
                # BUG FIX: the original re-created `doc = Document()` here,
                # silently discarding everything already written; reuse the
                # document being built instead.
                section = doc.sections[0]
                section.left_margin = Mm(5)
                section.right_margin = Mm(5)
                add_table_to_doc(doc, hr_df.iloc[:5])

            # -- list numbering marker ------------------------------------
            elif ele[0] == "List Numbering":
                # A leading digit means a numbered list, otherwise bullets.
                if (list(ele[1])[0]).isdigit():
                    listo = "Ordered"
                else:
                    listo = "UnOrdered"

            # -- list item text -------------------------------------------
            elif ele[0] == "List Data":
                style_name = 'List Number' if listo == "Ordered" else 'List Bullet'
                para = doc.add_paragraph(ele[1], style=style_name)
                listo = ""
                # BUG FIX: italic/bold live on runs; assigning them on the
                # Paragraph object had no effect in python-docx.
                for run in para.runs:
                    if ele[4] is True or ele[4] == "true":
                        run.italic = True
                    if ele[5] > 300:
                        run.bold = True

            # -- figures --------------------------------------------------
            elif ele[0] == "Figure":
                print(ele)
                # ele[3..6] is the bounding box (x0, y0, x1, y1).
                doc.add_picture("C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\" + str(ele[1]),
                                width=Pt(int(ele[5]) - int(ele[3])),
                                height=Pt(int(ele[6]) - int(ele[4])))
                last_paragraph = doc.paragraphs[-1]
                last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # One page of output per page of input.
        doc.add_page_break()

    # Save to file.
    doc.save(output_path)
|
|
|
|
|
|
|
|
|
|
def convert_books(inputfile):
    """Kick off PDF extraction for *inputfile*.

    The extraction archive is written next to the input as "<stem>.zip".
    Always returns 1, kept for backward compatibility with callers that
    treat any return as success.
    """
    # BUG FIX: the original used str(inputfile).split(".")[0], which
    # truncated at the FIRST dot — any dot in a directory name mangled the
    # output path.  splitext only strips the final extension.
    outputzipname = os.path.splitext(str(inputfile))[0] + ".zip"
    logging.info("writing extraction archive to %s", outputzipname)

    pdf_text_images_extractor(api_creds, inputfile, outputzipname)

    # zip_extractor(outputzipname)  # downstream unpack, currently disabled
    return 1
|
|
|
|
|
|
# input2 = "C:\\Users\\ANSU\\Downloads\\testtt12.pdf"
|
|
# inputfile = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\Adobe\\adobe-dc-pdf-services-sdk-extract-python-samples\\resources\\ihuuh_tnew.pdf"
|
|
# outputzipname = "someoutput2.zip"
|
|
# json_file = "C:\\Users\\ANSU\\Desktop\\MNF\\convertBook\\contents\\structuredData.json"
|
|
|
|
|
|
#convert_books("/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf")
|
|
#pdf_text_images_extractor(api_creds,"/home/user/mnf/project/MNF/conversion/booktranslator/ihuuh_tnew.pdf","output.zip")
|
|
|
|
#zip_extractor(outputzipname)
|
|
|
|
# all_pages_data = json_parser(json_file)
|
|
# #print(all_pages_data)
|
|
# word_creator(all_pages_data)
|
|
|