189 lines
7.0 KiB
Python
Executable File
189 lines
7.0 KiB
Python
Executable File
# -> For OS Related Operations
|
|
import os
|
|
|
|
# -> For Extracting Docx Contents
|
|
import zipfile
|
|
|
|
# -> For Parsing Docx Contents
|
|
from bs4 import BeautifulSoup
|
|
|
|
# -> For Recreating Docx
|
|
from docx import Document
|
|
from docx.shared import Mm
|
|
from docx.shared import Inches, Cm, Pt
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.oxml.shared import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
from docx.enum.section import WD_SECTION
|
|
|
|
# -> For Translation
|
|
from conversion.translation.translation_function import translate_comparison
|
|
|
|
|
|
class ConvertBook:
    """Extract the contents of a .docx file and rebuild them as a translated document.

    ``extract_docx_contents`` flattens the document body into a list of tagged
    records; ``recreate_translated_book`` replays those records into a new
    python-docx ``Document``, sending every piece of text through
    ``translate_comparison``.

    Record shapes (the first item is the record kind):
        ["List", text, [ilvl, numId]]
        ["Table", <w:tbl element>, number_rows, cols_in_rows]
        ["Image/Drawing", name, cx, cy]
        ["Para", text]   # one record per ``w:t`` run, not per paragraph
    """

    # ``wp:extent`` cx/cy values are expressed in English Metric Units.
    EMU_PER_INCH = 914400

    # Deepest "List Number N" style requested from the target document.
    # NOTE(review): the python-docx default template is believed to ship
    # "List Number", "List Number 2" and "List Number 3" only - confirm.
    MAX_LIST_NUMBER_STYLE = 3

    def __init__(self):
        # Records produced by the most recent ``extract_docx_contents`` call.
        self.data_in_docx = []
        # Legacy counter kept only so existing attribute access keeps working;
        # table paragraphs are now skipped by an ancestor check instead.
        self.ignore_paras = 0

    def fetch_orig_text(self, tag):
        """Return ``tag.string`` (``None`` when the tag has no single string child)."""
        return tag.string

    def fetch_img(self, tag):
        """Return the ``name`` attribute of a ``wp:docPr`` tag.

        NOTE(review): ``recreate_translated_book`` uses this value as a file
        name under ``word/media``; that only works when the producing tool
        stored the media file name in ``docPr/@name`` - verify against real
        documents.
        """
        return tag["name"]

    def count_rows_cols(self, tag):
        """Return ``(number_of_rows, [cells_in_each_row])`` for a ``w:tbl`` tag.

        Both searches are recursive, so rows and cells of nested tables are
        counted as well.
        """
        rows = tag.find_all("w:tr")
        cols_in_rows = [len(row.find_all("w:tc")) for row in rows]
        return len(rows), cols_in_rows

    def Points_to_Inches(self, width, height):
        """Convert a ``wp:extent`` size (cx, cy in EMU) to inches.

        Despite the historical name, the inputs are EMU values, given as ints
        or digit strings. The previous divisors (1,270,000 for width and
        952,500 for height) shrank pictures and distorted their aspect ratio.

        Returns:
            ``(width_in_inches, height_in_inches)`` as floats.
        """
        return int(width) / self.EMU_PER_INCH, int(height) / self.EMU_PER_INCH

    @staticmethod
    def _list_style_for_level(level):
        """Map a ``w:ilvl`` value to a numbered-list style name."""
        try:
            depth = max(int(level), 0)
        except (TypeError, ValueError):
            # Unreadable level: treat the item as top level rather than fail.
            depth = 0
        if depth == 0:
            return "List Number"
        return f"List Number {min(depth + 1, ConvertBook.MAX_LIST_NUMBER_STYLE)}"

    @staticmethod
    def _translate(text, source_language, target_language):
        """Translate ``text``; missing text becomes an empty string."""
        if text is None:
            return ""
        # str() turns a BeautifulSoup NavigableString into a plain string.
        return translate_comparison(str(text), source_language, target_language)

    def _joined_text(self, tag):
        """Concatenate the text of every ``w:t`` run found below ``tag``."""
        return "".join(self.fetch_orig_text(run) or "" for run in tag.find_all("w:t"))

    def _cell_text(self, cell):
        """Return the text of a ``w:tc`` cell, one line per contained paragraph."""
        paragraphs = cell.find_all("w:p")
        if not paragraphs:
            return self._joined_text(cell)
        return "\n".join(self._joined_text(paragraph) for paragraph in paragraphs)

    @staticmethod
    def _numbering_of(num_pr):
        """Return ``[ilvl, numId]`` (as strings) read from a ``w:numPr`` tag."""
        level_tag = num_pr.find("w:ilvl")
        num_id_tag = num_pr.find("w:numId")
        level = level_tag.get("w:val", "0") if level_tag is not None else "0"
        num_id = num_id_tag.get("w:val", "") if num_id_tag is not None else ""
        return [level, num_id]

    def _collect_paragraph(self, element):
        """Append one record per text run / drawing found inside ``element``."""
        for ele in element.find_all(["w:t", "wp:docPr"]):
            # -> Image case
            if str(ele).startswith("<wp:docPr"):
                # Prefer the extent that belongs to this drawing; fall back to
                # the first one in the paragraph as the original code did.
                extent = ele.parent.find("wp:extent") if ele.parent is not None else None
                if extent is None:
                    extent = element.find("wp:extent")
                self.data_in_docx.append(
                    ["Image/Drawing", self.fetch_img(ele), extent["cx"], extent["cy"]]
                )
            # -> Text case
            else:
                self.data_in_docx.append(["Para", self.fetch_orig_text(ele)])

    def extract_docx_contents(self, docx_path, extraction_folder):
        """Unpack ``docx_path`` and return its body as a list of records.

        Every archive member is extracted into ``extraction_folder``; then
        ``word/document.xml`` is parsed and its paragraphs and tables are
        turned into the records described on the class.

        Args:
            docx_path: path of the .docx archive to read.
            extraction_folder: directory that receives the unpacked archive.

        Returns:
            ``self.data_in_docx``, rebuilt from scratch on every call.

        Raises:
            KeyError: if the archive has no ``word/document.xml`` member.
        """
        print(docx_path)

        # Start clean so a reused instance does not mix two documents.
        self.data_in_docx = []
        self.ignore_paras = 0

        # -> Extracting all the files of Docx (the archive is closed afterwards)
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path=f"{extraction_folder}")
            # -> Reading the main XML with all the document(.docx) data
            content = archive.read("word/document.xml")

        # -> Parse the XML content using BeautifulSoup
        soup = BeautifulSoup(content, "xml")

        # -> Find all paragraphs and tables (recursive, so table cells too)
        elements = soup.find_all(["w:p", "w:tbl"])

        print("All Elemnets 73")
        for element in elements:
            # Anything inside a table is carried by that table's own record.
            if element.find_parent("w:tbl") is not None:
                continue

            # -> Possible Numbered Lists
            if "w14:paraId" in element.attrs and element.find("w:numPr") is not None:
                self.data_in_docx.append(
                    [
                        "List",
                        self._joined_text(element),
                        self._numbering_of(element.find("w:numPr")),
                    ]
                )

            # -> Possible Tables (a bare <w:tbl> tag without attributes)
            elif not element.attrs and str(element).startswith("<w:tbl>"):
                number_rows, cols_in_rows = self.count_rows_cols(element)
                self.data_in_docx.append(["Table", element, number_rows, cols_in_rows])

            # -> Possible Para of text; it may contain images or drawings too
            else:
                self._collect_paragraph(element)

        return self.data_in_docx

    def _add_para(self, doc, text, source_language, target_language):
        """Append one justified body paragraph holding the translated ``text``."""
        act = doc.add_paragraph(style=doc.styles['Normal'])
        act_format = act.paragraph_format
        act_format.space_after = Pt(12)
        act_format.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
        act_format.line_spacing = Pt(12)
        act_format.left_indent = Inches(0)
        act.add_run(self._translate(text, source_language, target_language))

    def _add_table(self, doc, record, source_language, target_language):
        """Rebuild one "Table" record cell by cell.

        Text is gathered per ``w:tc`` cell rather than from a flat list of
        runs, so a cell with several runs (or none) no longer shifts the
        content of the cells that follow it.
        """
        rows = [
            [self._cell_text(cell) for cell in row.find_all("w:tc")]
            for row in record[1].find_all("w:tr")
        ]
        column_count = max((len(row) for row in rows), default=0)
        if column_count == 0:
            # No cells at all: nothing to draw.
            return

        # -> Creating a Table with one row, sized for the widest row
        table = doc.add_table(rows=1, cols=column_count)

        # -> Setting the style for table borders
        table.style = 'Table Grid'

        for row_index, row_texts in enumerate(rows):
            # The first source row reuses the row created with the table.
            cells = table.rows[0].cells if row_index == 0 else table.add_row().cells
            for idx, cell_text in enumerate(row_texts):
                cells[idx].text = self._translate(cell_text, source_language, target_language)

    def recreate_translated_book(self, data_book_list, docx_name, source_language, target_language):
        """Write a translated copy of the extracted records to ``docx_name``.

        Args:
            data_book_list: records as returned by ``extract_docx_contents``.
            docx_name: output path. Pictures are read from ``word/media`` next
                to this file, so the source archive must have been extracted
                into the same directory.
            source_language: passed through to ``translate_comparison``.
            target_language: passed through to ``translate_comparison``.
        """
        print("Entered Recreating book function")
        doc = Document()

        # dirname() keeps a bare file name relative instead of pointing at "/".
        media_dir = os.path.join(os.path.dirname(str(docx_name)), "word", "media")
        normal_style_ready = False

        for ele in data_book_list:
            kind = ele[0]

            # -> For Unordered list use Bullet instead of Number
            if kind == "List":
                doc.add_paragraph(
                    self._translate(ele[1], source_language, target_language),
                    style=self._list_style_for_level(ele[2][0]),
                )

            # -> Adding Normal Text in Docx
            elif kind == "Para":
                if not normal_style_ready:
                    # The style is document-wide; configure it once, and only
                    # when the book actually contains body text.
                    font = doc.styles['Normal'].font
                    font.name = "Courier New"
                    font.size = Pt(12)
                    normal_style_ready = True
                self._add_para(doc, ele[1], source_language, target_language)

            elif kind == "Image/Drawing":
                width, height = self.Points_to_Inches(ele[2], ele[3])
                doc.add_picture(
                    os.path.join(media_dir, ele[1]),
                    width=Inches(width),
                    height=Inches(height),
                )

            elif kind == "Table":
                self._add_table(doc, ele, source_language, target_language)

        doc.save(docx_name)
|
|
|
|
|
|
|
|
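# Example usage (the extraction folder must be the directory of the output file,
# because pictures are read back from "<output dir>/word/media"):
# docx_path = "fddfd_testing_112.docx"
# a = ConvertBook()
# records = a.extract_docx_contents(docx_path, extraction_folder)
# a.recreate_translated_book(records, docx_name, source_language, target_language)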
|
|
|