# Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator2/convertBookDocx.py

# -> For OS Related Operations
import os

# -> For Extracting Docx Contents
import zipfile

# -> For Parsing Docx Contents
from bs4 import BeautifulSoup

# -> For Recreating Docx
from docx import Document
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION

# -> For Translation
from conversion.translation.translation_function import translate_comparison


class ConvertBook:
    """Extract the content of a .docx file and rebuild a translated copy.

    ``extract_docx_contents`` unzips the document, walks ``word/document.xml``
    and records an ordered list of entries.  ``recreate_translated_book``
    replays that list into a new python-docx document, translating every
    piece of text with ``translate_comparison``.

    Entry formats stored in ``data_in_docx``:

    * ``["List", text, [ilvl, numId]]``
    * ``["Para", text]`` (one entry per ``w:t`` run, as before)
    * ``["Image/Drawing", name, cx, cy]`` (``cx``/``cy`` are EMU strings)
    * ``["Table", table_tag, number_rows, cols_in_rows]``
    """

    # wp:extent sizes are expressed in English Metric Units (914400 per inch).
    _EMU_PER_INCH = 914400

    def __init__(self):
        # Entries collected by extract_docx_contents().  The list lives on the
        # instance, so repeated extract calls keep appending to it.
        self.data_in_docx = []
        # Kept only for backward compatibility: table content is now skipped
        # by checking for a ``w:tbl`` ancestor instead of counting paragraphs.
        self.ignore_paras = 0

    def fetch_orig_text(self, tag):
        """Return ``tag.string`` (bs4 gives None when there is no single text child)."""
        return tag.string

    def fetch_img(self, tag):
        """Return the ``name`` attribute of a ``wp:docPr`` tag."""
        return tag["name"]

    def count_rows_cols(self, tag):
        """Return ``(row_count, [cell_count_per_row])`` for a ``w:tbl`` tag."""
        rows = tag.find_all("w:tr")
        cols_in_rows = [len(row.find_all("w:tc")) for row in rows]
        return len(rows), cols_in_rows

    def Points_to_Inches(self, width, height):
        """Convert a ``wp:extent`` size (``cx``, ``cy``) from EMU to inches.

        Both axes use the same EMU-per-inch factor; the previous per-axis
        divisors differed from each other and distorted the aspect ratio.

        Args:
            width: ``cx`` value, as an int or a numeric string.
            height: ``cy`` value, as an int or a numeric string.

        Returns:
            ``(width_in_inches, height_in_inches)`` as floats.
        """
        return (
            int(width) / self._EMU_PER_INCH,
            int(height) / self._EMU_PER_INCH,
        )

    @staticmethod
    def _qualified_name(tag):
        """Return ``prefix:name`` for a tag, whether or not ``tag.name`` already holds the prefix."""
        name = tag.name or ""
        prefix = getattr(tag, "prefix", None)
        if prefix and ":" not in name:
            return f"{prefix}:{name}"
        return name

    @staticmethod
    def _paragraph_text(paragraph):
        """Return the text of every ``w:t`` run inside *paragraph*, concatenated."""
        return "".join(run.get_text() for run in paragraph.find_all("w:t"))

    def _table_cell_texts(self, table):
        """Return the table text as a list of rows, each a list of cell strings.

        Paragraphs inside one cell are joined with a newline, so empty cells
        and cells with several runs keep their position in the grid.
        """
        rows = []
        for row in table.find_all("w:tr"):
            cells = []
            for cell in row.find_all("w:tc"):
                cells.append(
                    "\n".join(self._paragraph_text(p) for p in cell.find_all("w:p"))
                )
            rows.append(cells)
        return rows

    @staticmethod
    def _list_style_name(level):
        """Map a list indent level (``"0"``, ``"1"``, ...) to a 'List Number' style name."""
        level = int(level)
        return 'List Number' if level == 0 else f'List Number {level + 1}'

    @staticmethod
    def _translate_text(text, source_language, target_language):
        """Translate *text*; missing or whitespace-only text is returned without a translation call."""
        if text is None:
            return ""
        text = str(text)
        if not text.strip():
            return text
        return translate_comparison(text, source_language, target_language)

    def extract_docx_contents(self, docx_path, extraction_folder):
        """Unzip *docx_path* into *extraction_folder* and collect its content.

        Args:
            docx_path: Path of the source .docx file.
            extraction_folder: Directory that receives every file of the archive
                (images end up in ``<extraction_folder>/word/media``).

        Returns:
            ``self.data_in_docx`` with the new entries appended (see the class
            docstring for the entry formats).

        Raises:
            KeyError: If the archive has no ``word/document.xml``.
        """
        print(docx_path)
        # -> Extracting all the files of Docx; the context manager closes the archive.
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path=f"{extraction_folder}")
            # -> Reading the main XML with all the document(.docx) data
            content = archive.read("word/document.xml")

        # -> Parse the XML content using BeautifulSoup
        soup = BeautifulSoup(content, "xml")

        # -> Find all paragraphs and tables, in document order
        elements = soup.find_all(["w:p", "w:tbl"])

        print("All Elemnets 73")
        for element in elements:
            # Anything nested in a table is already covered by that table's
            # "Table" entry, so it must not be emitted a second time.
            if element.find_parent("w:tbl") is not None:
                continue

            # -> Possible Tables
            if self._qualified_name(element) == "w:tbl":
                number_rows, cols_in_rows = self.count_rows_cols(element)
                self.data_in_docx.append(["Table", element, number_rows, cols_in_rows])
                continue

            # -> Possible Numbered Lists
            num_pr = element.find("w:numPr")
            if "w14:paraId" in element.attrs and num_pr is not None:
                level_tag = num_pr.find("w:ilvl")
                num_id_tag = num_pr.find("w:numId")
                # Read the attribute values instead of indexing into the
                # serialized XML, so multi-digit values are handled too.
                level = level_tag.get("w:val", "0") if level_tag is not None else "0"
                num_id = num_id_tag.get("w:val", "0") if num_id_tag is not None else "0"
                self.data_in_docx.append(
                    ["List", self._paragraph_text(element), [level, num_id]]
                )
                continue

            # -> Possible Para of text and inside para there can be images or drawings too
            for ele in element.find_all(["w:t", "wp:docPr"]):
                # -> Image case
                if self._qualified_name(ele) == "wp:docPr":
                    image_name = self.fetch_img(ele)
                    # The size belongs to this drawing: look next to the docPr
                    # first, so several images in one paragraph keep their own size.
                    container = ele.parent
                    extent = container.find("wp:extent") if container is not None else None
                    if extent is None:
                        extent = element.find("wp:extent")
                    self.data_in_docx.append(
                        ["Image/Drawing", image_name, extent["cx"], extent["cy"]]
                    )
                # -> Text case
                else:
                    self.data_in_docx.append(["Para", self.fetch_orig_text(ele)])

        return self.data_in_docx

    def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):
        """Build a translated .docx from the entries of ``extract_docx_contents``.

        Args:
            data_book_list: Entries as returned by ``extract_docx_contents``.
            docx_name: Output path.  Images are read from
                ``<directory of docx_name>/word/media``, so that directory has
                to be the folder the source document was extracted into.
            source_language: Passed through to ``translate_comparison``.
            target_language: Passed through to ``translate_comparison``.

        NOTE(review): images are looked up by their ``wp:docPr`` name; whether
        that name matches the file name in ``word/media`` depends on the tool
        that produced the source document -- confirm for your inputs.
        """
        print("Entered Recreating book function")
        doc = Document()
        for ele in data_book_list:
            kind = ele[0]

            # TODO: use a bullet style for unordered lists; every list is
            # currently rendered with the numbered styles.
            if kind == "List":
                doc.add_paragraph(
                    self._translate_text(ele[1], source_language, target_language),
                    style=self._list_style_name(ele[2][0]),
                )

            # -> Adding Normal Text in Docx
            elif kind == "Para":
                style = doc.styles['Normal']
                font = style.font
                font.name = "Courier New"
                font.size = Pt(12)
                act = doc.add_paragraph(style=style)
                act_format = act.paragraph_format
                act_format.space_after = Pt(12)
                act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
                act_format.line_spacing = Pt(12)
                act_format.left_indent = Inches(0)
                act.add_run(self._translate_text(ele[1], source_language, target_language))

            elif kind == "Image/Drawing":
                width, height = self.Points_to_Inches(ele[2], ele[3])
                # dirname/join keeps a bare output file name relative (the old
                # '/'-split produced "/word/media/..."); basename stops a
                # crafted image name from pointing outside the media folder.
                media_path = os.path.join(
                    os.path.dirname(str(docx_name)),
                    "word",
                    "media",
                    os.path.basename(str(ele[1])),
                )
                doc.add_picture(media_path, width=Inches(width), height=Inches(height))

            elif kind == "Table":
                # -> Getting the Table data, one string per cell
                rows = self._table_cell_texts(ele[1])
                n_cols = max((len(cells) for cells in rows), default=0)
                if n_cols == 0:
                    # Nothing to render for a table without cells.
                    continue

                # Size the grid from the widest row so no row overflows it.
                table = doc.add_table(rows=len(rows), cols=n_cols)

                # -> Setting the style for table borders
                table.style = 'Table Grid'

                for docx_row, texts in zip(table.rows, rows):
                    cells = docx_row.cells
                    for idx, text in enumerate(texts):
                        cells[idx].text = self._translate_text(
                            text, source_language, target_language
                        )

        doc.save(docx_name)


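# Example usage (docx_name must be located inside extraction_folder so that
# the extracted images under word/media can be found):
# docx_path = "fddfd_testing_112.docx"
# a = ConvertBook()
# data = a.extract_docx_contents(docx_path, extraction_folder)
# a.recreate_translated_book(data, docx_name, source_language, target_language)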