# -> For OS Related Operations
import os
# -> For Extracting Docx Contents
import zipfile

# -> For Parsing Docx Contents
from bs4 import BeautifulSoup
# -> For Recreating Docx
from docx import Document
from docx.shared import Mm
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
from docx.enum.section import WD_SECTION

# -> For Translation
from conversion.translation.translation_function import translate_comparison


class ConvertBook:
    """Extract the contents of a .docx file and rebuild a translated copy.

    ``extract_docx_contents`` appends one entry per recognised element to
    ``self.data_in_docx``; ``recreate_translated_book`` consumes such a list.
    Entry shapes:

    * ``["List", text, [ilvl, numId]]``
    * ``["Table", table_tag, number_rows, cols_in_rows]``
    * ``["Image/Drawing", name, width, height]``
    * ``["Para", text]``
    """

    # Deepest "List Number N" paragraph style used when rebuilding lists.
    # NOTE(review): python-docx's default template is understood to ship
    # "List Number", "List Number 2" and "List Number 3" only -- confirm.
    _MAX_LIST_STYLE_LEVEL = 3

    def __init__(self):
        # Entries collected by extract_docx_contents (accumulates across calls).
        self.data_in_docx = []
        # Number of upcoming paragraphs to skip because they belong to a table.
        self.ignore_paras = 0

    def fetch_orig_text(self, tag):
        """Return ``tag.string`` (may be ``None``); ``None`` if *tag* is ``None``."""
        return tag.string if tag is not None else None

    def fetch_img(self, tag):
        """Return the ``name`` attribute of *tag*."""
        return tag["name"]

    def count_rows_cols(self, tag):
        """Return ``(row_count, [cell_count_per_row])`` for a table tag.

        Rows are the ``w:tr`` descendants of *tag*; cells are the ``w:tc``
        descendants of each row.
        """
        rows = tag.find_all("w:tr")
        cols_in_rows = [len(row.find_all("w:tc")) for row in rows]
        return len(rows), cols_in_rows

    def Points_to_Inches(self, width, height):
        """Scale a raw width/height pair to the values passed to ``Inches``.

        Returns ``(int(width) / 12700 / 100, int(height) / 9525 / 100)``.
        NOTE(review): the divisors are kept exactly as found; confirm the
        unit of the inputs (presumably EMU) and whether these factors are
        the intended ones.
        """
        return (int(width) / 12700) / 100, (int(height) / 9525) / 100

    @staticmethod
    def _local_name(tag):
        """Return the tag name without its namespace prefix (``w:tbl`` -> ``tbl``)."""
        return tag.name.rsplit(":", 1)[-1]

    @staticmethod
    def _numbering_ids(num_pr):
        """Return ``[ilvl, numId]`` (strings) read from a ``w:numPr`` tag.

        Reads the ``w:val`` attributes instead of indexing fixed character
        offsets in the serialized tag, so multi-digit values and different
        attribute layouts are handled. A missing child falls back to ``"0"``.
        """
        values = []
        for child_name in ("w:ilvl", "w:numId"):
            child = num_pr.find(child_name)
            values.append(child.get("w:val", "0") if child is not None else "0")
        return values

    def _translate(self, text, source_language, target_language):
        """Translate *text*; empty or ``None`` text yields ``""`` untranslated."""
        if not text:
            return ""
        return translate_comparison(text, source_language, target_language)

    def extract_docx_contents(self, docx_path, extraction_folder):
        """Unpack *docx_path* and collect its lists, tables, text and images.

        Every member of the archive is extracted into *extraction_folder*,
        then ``word/document.xml`` is parsed and one entry per recognised
        element is appended to ``self.data_in_docx`` (see the class
        docstring for the entry shapes).

        Returns:
            ``self.data_in_docx``.

        Raises:
            KeyError: if the archive has no ``word/document.xml`` member.
        """
        print(docx_path)

        # -> Extracting all the files of Docx; the context manager closes the archive
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path=str(extraction_folder))
            # -> Reading the main XML with all the document(.docx) data
            content = archive.read("word/document.xml")

        # -> Parse the XML content using BeautifulSoup
        soup = BeautifulSoup(content, "xml")

        # -> Find all paragraph and table elements, in document order
        elements = soup.find_all(["w:p", "w:tbl"])
        print("All Elemnets 73")

        for element in elements:
            num_pr = element.find("w:numPr")

            # -> Possible Numbered Lists
            if "w14:paraId" in element.attrs and num_pr is not None:
                self.data_in_docx.append([
                    "List",
                    # Pass the tag itself: fetch_orig_text reads ``.string``,
                    # which a plain str does not have.
                    self.fetch_orig_text(element.find("w:t")),
                    self._numbering_ids(num_pr),
                ])

            # -> Possible Tables
            elif self._local_name(element) == "tbl":
                number_rows, cols_in_rows = self.count_rows_cols(element)
                self.data_in_docx.append(["Table", element, number_rows, cols_in_rows])
                # The table's own paragraphs follow it in ``elements``; skip
                # them. This assumes one paragraph per cell.
                self.ignore_paras = sum(cols_in_rows)

            # -> Possible Para of text and inside para there can be images or drawings too
            else:
                if self.ignore_paras > 0:
                    self.ignore_paras -= 1
                    continue

                # -> If tag is text then fetch the orig_text from the whole tag
                #    or else it is an image/drawing
                for ele in element.find_all(["w:t", "wp:docPr"]):
                    # -> Image case
                    if self._local_name(ele) == "docPr":
                        # NOTE(review): the body of this branch was missing
                        # from the reviewed source. It is reconstructed from
                        # what recreate_translated_book consumes (name, width,
                        # height); the size is presumably the sibling
                        # ``wp:extent`` cx/cy -- verify against the original.
                        parent = ele.parent
                        extent = (
                            parent.find("wp:extent", recursive=False)
                            if parent is not None else None
                        )
                        width = extent.get("cx") if extent is not None else None
                        height = extent.get("cy") if extent is not None else None
                        self.data_in_docx.append(
                            ["Image/Drawing", self.fetch_img(ele), width, height]
                        )
                    # -> Text case
                    else:
                        orig_text = self.fetch_orig_text(ele)
                        self.data_in_docx.append(["Para", orig_text])

        return self.data_in_docx

    def _add_list_item(self, doc, entry, source_language, target_language):
        """Append one translated numbered-list paragraph to *doc*."""
        # ilvl 0 -> "List Number", ilvl 1 -> "List Number 2", ...; deeper
        # levels are clamped to the deepest supported style.
        style_level = min(int(entry[2][0]) + 1, self._MAX_LIST_STYLE_LEVEL)
        style = "List Number" if style_level <= 1 else f"List Number {style_level}"
        doc.add_paragraph(
            self._translate(entry[1], source_language, target_language),
            style=style,
        )

    def _add_text_paragraph(self, doc, entry, source_language, target_language):
        """Append one translated, justified body paragraph to *doc*."""
        style = doc.styles['Normal']
        font = style.font
        font.name = "Courier New"
        font.size = Pt(12)

        paragraph = doc.add_paragraph(style=style)
        paragraph_format = paragraph.paragraph_format
        paragraph_format.space_after = Pt(12)
        paragraph_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        paragraph_format.line_spacing = Pt(12)
        paragraph_format.left_indent = Inches(0)

        paragraph.add_run(self._translate(entry[1], source_language, target_language))

    def _add_picture(self, doc, entry, media_dir):
        """Append the picture named by *entry* from *media_dir* to *doc*."""
        picture_path = os.path.join(media_dir, str(entry[1]))
        if entry[2] is None or entry[3] is None:
            # No recorded size: let python-docx use the image's native size.
            doc.add_picture(picture_path)
            return
        width, height = self.Points_to_Inches(entry[2], entry[3])
        doc.add_picture(picture_path, width=Inches(width), height=Inches(height))

    def _add_table(self, doc, entry, source_language, target_language):
        """Append a translated copy of the table described by *entry* to *doc*.

        Text is read cell by cell (all ``w:t`` runs of a cell joined), so
        empty cells and cells with several runs stay in their own column.
        """
        table_tag, number_rows, cols_in_rows = entry[1], entry[2], entry[3]
        # Nothing to build for a table without rows or cells.
        if number_rows < 1 or not cols_in_rows or max(cols_in_rows) < 1:
            return

        # Size the grid to the widest row so ragged rows cannot overflow it.
        table = doc.add_table(rows=number_rows, cols=max(cols_in_rows))
        # -> Setting the style for table borders
        table.style = 'Table Grid'

        for row_idx, row_tag in enumerate(table_tag.find_all("w:tr")):
            cells = table.rows[row_idx].cells
            for col_idx, cell_tag in enumerate(row_tag.find_all("w:tc")):
                cell_text = "".join(
                    self.fetch_orig_text(text_tag) or ""
                    for text_tag in cell_tag.find_all("w:t")
                )
                cells[col_idx].text = self._translate(
                    cell_text, source_language, target_language
                )

    def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):
        """Build a translated .docx from *data_book_list* and save it.

        Args:
            data_book_list: entries as produced by ``extract_docx_contents``.
            docx_name: path the new document is saved to. Pictures are read
                from ``word/media`` next to this path, i.e. the source
                archive is assumed to have been extracted into the same
                directory -- TODO confirm with callers.
            source_language: passed through to ``translate_comparison``.
            target_language: passed through to ``translate_comparison``.
        """
        print("Entered Recreating book function")
        doc = Document()
        # os.path keeps the media path relative when docx_name has no
        # directory component (string splitting produced "/word/media/...").
        media_dir = os.path.join(os.path.dirname(str(docx_name)), "word", "media")

        for ele in data_book_list:
            kind = ele[0]
            # -> For Unordered list use Bullet instead of Number
            if kind == "List":
                self._add_list_item(doc, ele, source_language, target_language)
            # -> Adding Normal Text in Docx
            elif kind == "Para":
                self._add_text_paragraph(doc, ele, source_language, target_language)
            elif kind == "Image/Drawing":
                self._add_picture(doc, ele, media_dir)
            elif kind == "Table":
                self._add_table(doc, ele, source_language, target_language)

        doc.save(docx_name)


# docx_path = "fddfd_testing_112.docx"
# a = ConvertBook()
# a.recreate_translated_book(a.extract_docx_contents(docx_path))