Conversion_Kitchen_Code/kitchen_counter/conversion/booktranslator2/convertBookDocx.py

189 lines
7.0 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
# -> For OS Related Operations
import os
# -> For Extracting Docx Contents
import zipfile
# -> For Parsing Docx Contents
from bs4 import BeautifulSoup
# -> For Recreating Docx
from docx import Document
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION
# -> For Translation
from conversion.translation.translation_function import translate_comparison
class ConvertBook:
    """Extract the contents of a .docx file and rebuild a translated copy.

    ``extract_docx_contents`` unpacks the archive and flattens
    ``word/document.xml`` into a list of records;
    ``recreate_translated_book`` walks such a list, translates the text with
    ``translate_comparison`` and writes a new document with python-docx.

    Record shapes (the first item is the record type):
        ["List", text, [ilvl, numId]]
        ["Table", table_tag, number_rows, cols_in_rows]
        ["Image/Drawing", name, cx, cy]
        ["Para", text]
    """

    # Highest "List Number N" style this class will request; deeper list
    # levels are clamped to it instead of asking for a style that may not exist.
    _MAX_LIST_NUMBER_STYLE = 3

    def __init__(self):
        # -> Records collected by extract_docx_contents (appended to on every call)
        self.data_in_docx = []
        # -> Kept only for backward compatibility: paragraphs that live inside a
        #    table are now skipped by looking for a w:tbl ancestor, not by counting.
        self.ignore_paras = 0

    def fetch_orig_text(self, tag):
        """Return ``tag.string`` (``None`` when the tag has no single string)."""
        return tag.string

    def fetch_img(self, tag):
        """Return the ``name`` attribute of a ``wp:docPr`` tag."""
        return tag["name"]

    def count_rows_cols(self, tag):
        """Count the rows of a table tag and the cells in each row.

        Returns:
            ``(number_rows, cols_in_rows)`` where ``cols_in_rows[i]`` is the
            number of ``w:tc`` tags found under the i-th ``w:tr`` tag.
        """
        rows = tag.find_all("w:tr")
        cols_in_rows = [len(row.find_all("w:tc")) for row in rows]
        return len(rows), cols_in_rows

    def Points_to_Inches(self, width, height):
        """Scale a drawing's raw ``cx``/``cy`` extents to the sizes used on re-insert.

        Returns:
            ``(int(width) / 12700 / 100, int(height) / 9525 / 100)``.
        """
        # NOTE(review): width and height use different divisors, so the aspect
        # ratio of the re-inserted picture is not preserved. If cx/cy are EMU
        # (914400 per inch) both should presumably be divided by 914400 - confirm
        # with the author before changing, existing output depends on this.
        return (int(width) / 12700) / 100, (int(height) / 9525) / 100

    def _joined_text(self, tag):
        """Return the concatenated text of every ``w:t`` run found inside *tag*."""
        return "".join(self.fetch_orig_text(run) or "" for run in tag.find_all("w:t"))

    def _numbering_value(self, num_pr, child_name, default=None):
        """Return the ``w:val`` attribute of *child_name* inside a ``w:numPr`` tag."""
        child = num_pr.find(child_name)
        if child is None:
            return default
        return child.get("w:val", default)

    def _translate(self, text, source_language, target_language):
        """Translate *text*, leaving missing or blank text untouched.

        ``None`` becomes ``""`` and whitespace-only text is returned as is, so
        the translator is never called with nothing to translate.
        """
        if text is None:
            return ""
        text = str(text)
        if not text.strip():
            return text
        return translate_comparison(text, source_language, target_language)

    def extract_docx_contents(self, docx_path, extraction_folder):
        """Unpack *docx_path* and flatten its main document into records.

        Every member of the archive is extracted into *extraction_folder*, then
        ``word/document.xml`` is parsed and each top-level paragraph / table is
        turned into one or more records (see the class docstring).

        Args:
            docx_path: Path of the .docx file to read.
            extraction_folder: Directory that receives the unpacked archive.

        Returns:
            ``self.data_in_docx`` - the same list object, which keeps growing
            if the method is called more than once on one instance.

        Raises:
            KeyError: The archive has no ``word/document.xml`` member.
        """
        print(docx_path)
        # -> Extracting all the files of Docx (the archive is closed even on errors)
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path=extraction_folder)
            # -> Reading the main XML with all the document(.docx) data
            content = archive.read("word/document.xml")
        # -> Parse the XML content using BeautifulSoup
        soup = BeautifulSoup(content, "xml")
        # -> Find all text elements
        elements = soup.find_all(["w:p", "w:tbl"])
        print("All Elemnets 73")
        for element in elements:
            # -> Anything nested in a table is already covered by that table's
            #    record, however many paragraphs each cell holds.
            if element.find_parent("w:tbl") is not None:
                continue
            # -> Possible Tables
            if element.name == "w:tbl":
                number_rows, cols_in_rows = self.count_rows_cols(element)
                if number_rows:
                    self.data_in_docx.append(["Table", element, number_rows, cols_in_rows])
                continue
            num_pr = element.find("w:numPr")
            # -> Possible Numbered Lists
            if num_pr is not None and "w14:paraId" in element.attrs:
                # -> Read the numbering values from their attributes instead of
                #    fixed character offsets, so multi-digit ids survive.
                numbering = [
                    self._numbering_value(num_pr, "w:ilvl", "0"),
                    self._numbering_value(num_pr, "w:numId"),
                ]
                self.data_in_docx.append(["List", self._joined_text(element), numbering])
                continue
            # -> Possible Para of text and inside para there can be images or drawings too
            for ele in element.find_all(["w:t", "wp:docPr"]):
                # -> Image case
                if ele.name == "wp:docPr":
                    # -> Prefer the extent that sits next to this docPr so every
                    #    drawing of a multi-image paragraph keeps its own size.
                    extent = ele.parent.find("wp:extent") if ele.parent is not None else None
                    if extent is None:
                        extent = element.find("wp:extent")
                    self.data_in_docx.append(
                        ["Image/Drawing", self.fetch_img(ele), extent["cx"], extent["cy"]]
                    )
                # -> Text case (one record per w:t run)
                else:
                    self.data_in_docx.append(["Para", self.fetch_orig_text(ele)])
        return self.data_in_docx

    def _add_body_paragraph(self, doc, text):
        """Append *text* as a justified 12pt Courier New paragraph."""
        style = doc.styles['Normal']
        font = style.font
        font.name = "Courier New"
        font.size = Pt(12)
        act = doc.add_paragraph(style=style)
        act_format = act.paragraph_format
        act_format.space_after = Pt(12)
        act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        act_format.line_spacing = Pt(12)
        act_format.left_indent = Inches(0)
        act.add_run(text)

    def _add_table(self, doc, table_tag, source_language, target_language):
        """Rebuild *table_tag* in *doc*, translating the text of every cell."""
        # -> Text is gathered per cell so cells holding zero or several runs
        #    cannot shift the cells that follow them.
        rows_text = [
            [self._joined_text(cell) for cell in row.find_all("w:tc")]
            for row in table_tag.find_all("w:tr")
        ]
        # -> Size the grid for the widest row, not just the first one
        number_cols = max((len(row) for row in rows_text), default=0)
        if number_cols == 0:
            return
        table = doc.add_table(rows=len(rows_text), cols=number_cols)
        # -> Setting the style for table borders
        table.style = 'Table Grid'
        for table_row, row_text in zip(table.rows, rows_text):
            cells = table_row.cells
            for idx, cell_text in enumerate(row_text):
                cells[idx].text = self._translate(cell_text, source_language, target_language)

    def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):
        """Write a translated document built from *data_book_list*.

        Args:
            data_book_list: Records as produced by ``extract_docx_contents``.
                For "Table" records the layout is re-read from the stored tag.
            docx_name: Path the new document is saved to. Pictures are loaded
                from ``word/media`` next to this path, i.e. the archive is
                expected to have been extracted into the same directory.
            source_language: Passed through to ``translate_comparison``.
            target_language: Passed through to ``translate_comparison``.
        """
        print("Entered Recreating book function")
        doc = Document()
        # -> os.path keeps a bare file name relative instead of pointing at "/word/media"
        media_dir = os.path.join(os.path.dirname(str(docx_name)), "word", "media")
        for ele in data_book_list:
            kind = ele[0]
            # -> For Unordered list use Bullet instead of Number
            if kind == "List":
                style_number = min(int(ele[2][0]) + 1, self._MAX_LIST_NUMBER_STYLE)
                list_style = 'List Number' if style_number == 1 else f'List Number {style_number}'
                doc.add_paragraph(
                    self._translate(ele[1], source_language, target_language),
                    style=list_style,
                )
            # -> Adding Normal Text in Docx
            elif kind == "Para":
                self._add_body_paragraph(
                    doc, self._translate(ele[1], source_language, target_language)
                )
            elif kind == "Image/Drawing":
                width, height = self.Points_to_Inches(ele[2], ele[3])
                doc.add_picture(
                    os.path.join(media_dir, str(ele[1])),
                    width=Inches(width),
                    height=Inches(height),
                )
            elif kind == "Table":
                self._add_table(doc, ele[1], source_language, target_language)
        doc.save(docx_name)
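# Example usage (extraction_folder, docx_name, source_language and
# target_language must be supplied by the caller):
# docx_path = "fddfd_testing_112.docx"
# a = ConvertBook()
# data = a.extract_docx_contents(docx_path, extraction_folder)
# a.recreate_translated_book(data, docx_name, source_language, target_language)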