Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator2/convertBookDocx.py

# -> For OS Related Operations
import os

# -> For Extracting Docx Contents
import zipfile

# -> For Parsing Docx Contents
from bs4 import BeautifulSoup

# -> For Recreating Docx
from docx import Document
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION

# -> For Translation
from conversion.translation.translation_function import translate_comparison


class ConvertBook:
    """Extract the content of a .docx file and rebuild it as a translated .docx.

    ``extract_docx_contents`` unzips the document, parses
    ``word/document.xml`` and produces a flat list of records, one per
    content item, in document order:

    * ``["List", text, [ilvl, numId]]``
    * ``["Para", text]`` (one record per ``w:t`` run)
    * ``["Image/Drawing", name, cx, cy]``
    * ``["Table", table_tag, number_rows, cols_in_rows]``

    ``recreate_translated_book`` consumes such a list, translates the text
    with ``translate_comparison`` and writes a new document.
    """

    # ``wp:extent`` stores ``cx``/``cy`` in English Metric Units (EMU);
    # 914400 EMU make one inch.
    EMU_PER_INCH = 914400

    # NOTE(review): python-docx's default template is documented to ship
    # "List Number", "List Number 2" and "List Number 3" only; deeper list
    # levels are clamped to this value so the style lookup cannot fail.
    MAX_LIST_NUMBER_LEVEL = 3

    def __init__(self):
        # Records collected by extract_docx_contents (see class docstring).
        self.data_in_docx = []
        # Legacy counter; the extraction logic does not consult it. Kept so
        # external code that reads the attribute does not break.
        self.ignore_paras = 0

    def fetch_orig_text(self, tag):
        """Return ``tag.string`` (the text of a ``w:t`` tag), or None for a missing tag."""
        if tag is None:
            return None
        return tag.string

    def fetch_img(self, tag):
        """Return the ``name`` attribute of a ``wp:docPr`` tag."""
        return tag["name"]

    def count_rows_cols(self, tag):
        """Count the rows of a ``w:tbl`` tag and the cells in each row.

        Returns:
            ``(number_rows, cols_in_rows)`` where ``cols_in_rows[i]`` is the
            number of ``w:tc`` tags found under the i-th ``w:tr`` tag.
        """
        rows = tag.find_all("w:tr")
        cols_in_rows = [len(row.find_all("w:tc")) for row in rows]
        return len(rows), cols_in_rows

    def Points_to_Inches(self, width, height):
        """Convert a ``wp:extent`` size from EMU to inches.

        Despite the historical name, the inputs are the ``cx``/``cy`` values
        read from ``wp:extent``, which are EMU. Both axes use the same
        factor so the aspect ratio of the picture is preserved.

        Args:
            width: ``cx`` value (int or numeric string).
            height: ``cy`` value (int or numeric string).

        Returns:
            ``(width_in_inches, height_in_inches)`` as floats.
        """
        return (
            int(width) / self.EMU_PER_INCH,
            int(height) / self.EMU_PER_INCH,
        )

    def extract_docx_contents(self, docx_path, extraction_folder):
        """Unzip ``docx_path`` and collect its content records.

        Every archive member is extracted into ``extraction_folder``; the
        main document part is then parsed and its paragraphs, list items,
        images and tables are appended to ``self.data_in_docx``.

        Args:
            docx_path: Path of the source .docx file.
            extraction_folder: Directory that receives the unzipped files.

        Returns:
            ``self.data_in_docx``. Records are appended, so calling this
            method twice on one instance accumulates both documents.

        Raises:
            ValueError: If the archive has no ``word/document.xml`` part.
        """
        print(docx_path)

        # The context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path=f"{extraction_folder}")
            try:
                content = archive.read("word/document.xml")
            except KeyError as err:
                raise ValueError(
                    f"{docx_path!r} has no word/document.xml part"
                ) from err

        soup = BeautifulSoup(content, "xml")
        elements = soup.find_all(["w:p", "w:tbl"])

        print("All Elemnets 73")
        for element in elements:
            # find_all also returns everything nested inside a table. That
            # content is carried by the enclosing "Table" record, so skip it
            # here regardless of how many paragraphs each cell holds.
            if element.find_parent("w:tbl") is not None:
                continue

            if element.name == "w:tbl":
                number_rows, cols_in_rows = self.count_rows_cols(element)
                self.data_in_docx.append(
                    ["Table", element, number_rows, cols_in_rows]
                )
                continue

            # A paragraph is treated as a list item when it carries numbering
            # properties and a w14:paraId attribute.
            num_pr = element.find("w:numPr")
            if num_pr is not None and "w14:paraId" in element.attrs:
                text = "".join(
                    run.string or "" for run in element.find_all("w:t")
                )
                self.data_in_docx.append(
                    [
                        "List",
                        text,
                        [
                            self._numbering_value(num_pr, "w:ilvl"),
                            self._numbering_value(num_pr, "w:numId"),
                        ],
                    ]
                )
                continue

            self._collect_paragraph_items(element)

        return self.data_in_docx

    def _numbering_value(self, num_pr, child_name, default="0"):
        """Return the ``w:val`` of ``child_name`` under ``num_pr``, or ``default``."""
        child = num_pr.find(child_name)
        if child is None:
            return default
        value = child.get("w:val")
        if value is None:
            # Fallback in case the parser exposes the attribute unprefixed.
            value = child.get("val")
        return default if value is None else str(value)

    def _collect_paragraph_items(self, paragraph):
        """Append one record per text run or drawing found in ``paragraph``."""
        for node in paragraph.find_all(["w:t", "wp:docPr"]):
            if node.name != "wp:docPr":
                self.data_in_docx.append(["Para", self.fetch_orig_text(node)])
                continue

            # Prefer the extent stored next to this docPr so that several
            # drawings in one paragraph each keep their own size.
            container = node.parent
            extent = container.find("wp:extent") if container is not None else None
            if extent is None:
                extent = paragraph.find("wp:extent")
            if extent is None:
                # No size information: the drawing cannot be re-inserted.
                continue
            self.data_in_docx.append(
                ["Image/Drawing", self.fetch_img(node), extent["cx"], extent["cy"]]
            )

    def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):
        """Write a translated document built from ``data_book_list``.

        Args:
            data_book_list: Records as produced by ``extract_docx_contents``.
            docx_name: Output path. Pictures are read from the
                ``word/media`` directory located next to this path.
            source_language: Passed through to ``translate_comparison``.
            target_language: Passed through to ``translate_comparison``.

        Records whose kind is not one of "List", "Para", "Image/Drawing" or
        "Table" are ignored.
        """
        print("Entered Recreating book function")
        doc = Document()
        media_dir = os.path.join(os.path.dirname(str(docx_name)), "word", "media")

        for ele in data_book_list:
            kind = ele[0]

            if kind == "List":
                doc.add_paragraph(
                    self._translate(ele[1], source_language, target_language),
                    style=self._list_style_name(ele[2][0]),
                )

            elif kind == "Para":
                self._add_body_paragraph(
                    doc, self._translate(ele[1], source_language, target_language)
                )

            elif kind == "Image/Drawing":
                width, height = self.Points_to_Inches(ele[2], ele[3])
                # NOTE(review): ele[1] is the wp:docPr "name" attribute;
                # confirm that it matches a file name under word/media.
                doc.add_picture(
                    os.path.join(media_dir, ele[1]),
                    width=Inches(width),
                    height=Inches(height),
                )

            elif kind == "Table":
                self._add_table(doc, ele[1], source_language, target_language)

        doc.save(docx_name)

    def _translate(self, text, source_language, target_language):
        """Translate ``text``; empty or missing text yields "" without a translator call."""
        if not text:
            return ""
        return translate_comparison(str(text), source_language, target_language)

    def _list_style_name(self, level_text):
        """Map a zero-based list level (as text) to a "List Number" style name."""
        try:
            level = int(level_text)
        except (TypeError, ValueError):
            level = 0
        style_number = min(max(level, 0) + 1, self.MAX_LIST_NUMBER_LEVEL)
        if style_number == 1:
            return "List Number"
        return f"List Number {style_number}"

    def _add_body_paragraph(self, doc, text):
        """Append ``text`` as a justified 12pt "Courier New" paragraph."""
        # The font is set on the shared 'Normal' style; repeating the
        # assignment for every paragraph is idempotent.
        style = doc.styles['Normal']
        style.font.name = "Courier New"
        style.font.size = Pt(12)

        paragraph = doc.add_paragraph(style=style)
        layout = paragraph.paragraph_format
        layout.space_after = Pt(12)
        layout.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        layout.line_spacing = Pt(12)
        layout.left_indent = Inches(0)
        paragraph.add_run(text)

    def _add_table(self, doc, table_tag, source_language, target_language):
        """Append a 'Table Grid' table mirroring ``table_tag`` with translated cells."""
        source_rows = [row.find_all("w:tc") for row in table_tag.find_all("w:tr")]
        column_count = max((len(cells) for cells in source_rows), default=0)
        if column_count == 0:
            # Nothing to render for a table without cells.
            return

        table = doc.add_table(rows=len(source_rows), cols=column_count)
        table.style = 'Table Grid'

        # Text is gathered cell by cell, so an empty cell or a cell holding
        # several runs cannot shift the content of the cells after it.
        for target_row, source_cells in zip(table.rows, source_rows):
            for target_cell, source_cell in zip(target_row.cells, source_cells):
                paragraphs = (
                    "".join(run.string or "" for run in para.find_all("w:t"))
                    for para in source_cell.find_all("w:p")
                )
                target_cell.text = "\n".join(
                    self._translate(text, source_language, target_language)
                    for text in paragraphs
                    if text
                )


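# Example usage (paths and language codes are placeholders):
# docx_path = "fddfd_testing_112.docx"
# a = ConvertBook()
# data = a.extract_docx_contents(docx_path, extraction_folder)
# a.recreate_translated_book(data, output_docx_path, source_language, target_language)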
first commit 2024-04-27 09:33:09 +00:00			`# -> For OS Related Operations`
			`import os`

			`# -> For Extracting Docx Contents`
			`import zipfile`

			`# -> For Parsing Docx Contents`
			`from bs4 import BeautifulSoup`

			`# -> For Recreating Docx`
			`from docx import Document`
			`from docx.shared import Mm`
			`from docx.shared import Inches, Cm, Pt`
			`from docx.enum.text import WD_ALIGN_PARAGRAPH`
			`from docx.oxml.shared import OxmlElement`
			`from docx.oxml.ns import qn`
			`from docx.enum.section import WD_SECTION`

			`# -> For Translation`
			`from conversion.translation.translation_function import translate_comparison`


			`class ConvertBook:`

			`def __init__(self):`
			`self.data_in_docx = []`
			`self.ignore_paras = 0`

			`def fetch_orig_text(self, tag):`
			`return tag.string`



			`def fetch_img(self, tag):`
			`return tag["name"]`



			`def count_rows_cols(self, tag):`
			`cols_in_rows = []`
			`number_rows = len(tag.find_all("w:tr"))`
			`for ele in tag.find_all("w:tr"):`
			`number_cols = len(ele.find_all("w:tc"))`
			`cols_in_rows.append(number_cols)`
			`return number_rows, cols_in_rows`


			`def Points_to_Inches(self, width, height):`
			`return (int(width) / 12700 ) / 100, (int(height) / 9525 ) / 100`


			`def extract_docx_contents(self, docx_path, extraction_folder):`
			`print(docx_path)`
			`# -> Extracting all the files of Docx`
			`archive = zipfile.ZipFile(docx_path)`
			`for file in archive.filelist:`
			`archive.extract(file, path=f"{extraction_folder}")`
			`if file.filename == "word/document.xml":`
			`document_xml = file.filename`

			`# -> Reading the main XML with all the document(.docx) data`
			`content = archive.read(document_xml)`

			`# -> Parse the XML content using BeautifulSoup`
			`soup = BeautifulSoup(content, "xml")`

			`# -> Find all text elements`
			`elements = soup.find_all(["w:p", "w:tbl"])`

			`print("All Elemnets 73")`
			`# -> Iterate over elements and retrieve font settings`
			`for element in elements:`
			`# -> Possible Numbered Lists`
			`if len(element.attrs.keys()) > 0 and "w14:paraId" in list(element.attrs.keys()) and element.find("w:numPr") != None:`
			`self.data_in_docx.append(["List", self.fetch_orig_text(str(element.find("w:t"))), [str(element.find("w:numPr"))[24], str(element.find("w:numPr"))[44]]])`



			`# -> Possible Tables`
			`elif len(element.attrs.keys()) == 0 and str(element)[:7] == "<w:tbl>":`
			`number_rows, cols_in_rows = self.count_rows_cols(element)`
			`self.data_in_docx.append(["Table", element, number_rows, cols_in_rows])`
			`self.ignore_paras = sum(cols_in_rows)`



			`# -> Possible Para of text and inside para there can be images or drawings too`
			`# elif len(element.attrs.keys()) > 0 and "w14:paraId" in list(element.attrs.keys()):`
			`else:`
			`if self.ignore_paras > 0:`
			`self.ignore_paras -= 1`
			`pass`
			`else:`
			`small_elements = element.find_all(["w:t", "wp:docPr"])`
			`# -> If tag is text then fetch the orig_text from the whole tag or else it is an image/drawing`
			`for ele in small_elements:`
			`# -> Image case`
			`if str(ele)[:9] == "<wp:docPr":`
			`image_name = self.fetch_img(ele)`
			`self.data_in_docx.append(["Image/Drawing", image_name, element.find("wp:extent")["cx"], element.find("wp:extent")["cy"]])`
			`# -> Text case`
			`else:`
			`orig_text = self.fetch_orig_text(ele)`
			`self.data_in_docx.append(["Para", orig_text])`




			`return self.data_in_docx`



			`def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):`
			`print("Entered Recreating book function")`
			`doc = Document()`
			`for ele in data_book_list:`
			`# trans_text = translate_comparison(ele[1], source_language, target_language)`
			`# trans_text = ele[1]`


			`# -> For Unordered list use Bullet instead of Number`
			`if ele[0] == "List":`
			`doc.add_paragraph(translate_comparison(ele[1], source_language, target_language),`
			`style='List Number' if ele[2][0] == '0' else f'List Number {str(int(ele[2][0])+1)}')`



			`# -> Adding Normal Text in Docx`
			`elif ele[0] == "Para":`
			`style = doc.styles['Normal']`
			`font = style.font`
			`font.name = "Courier New"`
			`font.size = Pt(12)`
			`act = doc.add_paragraph(style=style)`
			`act_format = act.paragraph_format`
			`act_format.space_after = Pt(12)`
			`act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY`
			`act_format.line_spacing = Pt(12)`
			`act_format.left_indent = Inches(0)`
			`para = act.add_run(translate_comparison(ele[1], source_language, target_language))`
			`# if wanna_italic == True:`
			`# para.italic = True`
			`# if wanna_bold == True:`
			`# para.bold = True`



			`elif ele[0] == "Image/Drawing":`
			`width, height = self.Points_to_Inches(ele[2], ele[3])`
			`doc.add_picture(f"{'/'.join((str(docx_name).split('/'))[:-1])}/word/media/{ele[1]}",width = Inches(width), height = Inches(height))`



			`elif ele[0] == "Table":`

			`# -> Getting the Table data`
			`table_data = [self.fetch_orig_text(text) for text in ele[1].find_all("w:t")]`

			`# -> Creating a Table with one row`
			`table = doc.add_table(rows=1, cols=ele[3][0])`

			`# -> Setting the style for table borders`
			`table.style = 'Table Grid'`

			`# -> Adding heading in the 1st row of the table`
			`row = table.rows[0].cells`
			`for idx, heading in enumerate(table_data[:ele[3][0]]):`
			`row[idx].text = translate_comparison(heading, source_language, target_language)`

			`# -> if more than one rows`
			`if ele[2] > 1:`
			`for row_no in range(ele[2] - 1):`
			`# Adding a row and then adding data in it.`
			`row = table.add_row().cells`
			`start_idx = sum(ele[3][:row_no+1])`
			`last_idx = start_idx + ele[3][row_no+1]`
			`for idx, cell_data in enumerate(table_data[start_idx:last_idx]):`
			`row[idx].text = translate_comparison(str(cell_data), source_language, target_language)`


			`doc.save(docx_name)`



			`# docx_path = "fddfd_testing_112.docx"`
			`# a = ConvertBook()`
			`# a.recreate_translated_book(a.extract_docx_contents(docx_path))`