"""Conversion helpers for screenplay files and a notification e-mail helper.

Covers fdx -> DataFrame/plain text, DataFrame -> docx/JSON, editor JSON ->
DataFrame, docx -> pdf, pdf -> docx, and plain-text extraction.
"""
from io import IOBase, StringIO
import os
import csv
import subprocess

from centralisedFileSystem.models import File, ScreenPlay, Script
from bs4 import BeautifulSoup as bfs
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Mm, Pt
import pandas as pd
# import pdftotext
from pdf2docx import parse
# from tika import parser
# import page_script.models as ps_models
from django.conf import settings
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.core.mail import EmailMultiAlternatives

from .filesystem import get_file_path

# NOTE(review): `ps_models` and `parser` are still referenced below although
# their imports above are commented out; the functions using them raise
# NameError until a provider for those names is restored.

# Number of spaces put in front of each fdx element type by fdx_to_txt.
_FDX_TXT_INDENT = {
    'Character': 35,
    'Dialogue': 25,
    'Parenthetical': 30,
    'Transition': 55,
}
_FDX_TXT_DEFAULT_INDENT = 15

# fdx element types after which fdx_to_txt may emit a newline.
_FDX_TXT_BREAK_AFTER = ('Dialogue', 'Action', 'Scene Heading', 'Transition')

# (left indent, right indent) in inches per script element, for csv_to_docx.
_DOCX_INDENTS = {
    "slugline": (0, 0),
    "action": (0, 0),
    "dialogue": (1.0, 1.25),
    "parenthetical": (1.5, 2.25),
    "speaker": (2, 1),
    "transition": (2.5, 0),
    "special_term": (0, 0),
}


def fdx_to_csv(fdx_file: IOBase) -> pd.DataFrame:
    """
    Converts the fdx document to a Pandas DataFrame.
    Give FileIO as input, not a file path.

    Args:
        fdx_file (IOBase): fdx File object

    Returns:
        pd.DataFrame: one row per ``Paragraph`` node that carries a ``Type``
        attribute, with columns ``Text`` and ``Script_Element``
    """
    fdx_content = fdx_file.read()
    soup = bfs(fdx_content, 'xml')
    paragraphs = soup.find_all('Paragraph')
    print("paragraphs are", paragraphs)

    rows = []
    for para in paragraphs:
        print(para)
        try:
            script_element = para['Type']
        except KeyError:
            # Paragraph nodes without a Type attribute are skipped.
            continue
        cleaned_texts = [text.get_text().strip() for text in para.find_all('Text')]
        full_txt = " ".join(cleaned_texts).strip()
        rows.append({'Text': full_txt, 'Script_Element': script_element})

    # Build the frame once instead of concatenating a one-row frame per paragraph.
    return pd.DataFrame(rows, columns=['Text', 'Script_Element'])


def fdx_to_txt(fdx_file: IOBase) -> str:
    """
    Converts the fdx document to plain text (string with indentations).
    Give FileIO as input, not a file path.

    Each paragraph is written with a left padding that depends on its element
    type. A newline is written only after Dialogue, Action, Scene Heading and
    Transition paragraphs, never after the last paragraph, and not after a
    Dialogue that is directly followed by a Dialogue or Parenthetical.

    Args:
        fdx_file (IOBase): fdx File object

    Returns:
        str: string generated from given fdx
    """
    df = fdx_to_csv(fdx_file)
    last_index = len(df) - 1
    elements = df['Script_Element']

    with StringIO() as buffer:
        for index, txt, script_element in df.itertuples():
            pad = _FDX_TXT_INDENT.get(script_element, _FDX_TXT_DEFAULT_INDENT)
            buffer.write(txt.rjust(len(txt) + pad))

            if index >= last_index:
                continue
            if script_element not in _FDX_TXT_BREAK_AFTER:
                continue
            if script_element == 'Dialogue' and elements[index + 1] in ('Dialogue', 'Parenthetical'):
                continue
            buffer.write('\n')
        # getvalue() must be called before the buffer is closed by the with-block.
        return buffer.getvalue()


def csv_to_docx(csv: pd.DataFrame) -> Document:
    """
    Builds a docx document from a script DataFrame.

    Note: the parameter name shadows the ``csv`` module inside this function.

    Args:
        csv (pd.DataFrame): frame with ``script_element`` and ``content`` columns

    Returns:
        Document: document with one paragraph per row, in Courier New 12pt
    """
    output_doc = Document()
    font = output_doc.styles["Normal"].font
    font.name = "Courier New"
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    # NOTE(review): A4 is 210 mm wide (about 8.27 in); confirm 8.57 is intended.
    a4_right = 8.57
    section.page_width = Inches(a4_right)
    section.left_margin = Inches(1.5)

    for index in csv.index:
        para = output_doc.add_paragraph()
        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        script_element = csv["script_element"][index]
        content = csv["content"][index]
        print("content = ", content)

        if script_element == "blank":
            # The empty paragraph added above is the blank line.
            continue

        if script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        indents = _DOCX_INDENTS.get(script_element)
        if indents is not None:
            left, right = indents
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)

        if script_element in ("slugline", "speaker"):
            print(f"content is {script_element}")
            try:
                content = content.upper()
            except AttributeError as exp:
                # Non-string content (e.g. a NaN float) has no upper().
                print("Exception =", exp)

        if isinstance(content, float):
            # Float content (NaN from an empty cell) becomes empty text.
            content = ""
        para.text = content

    return output_doc


def _resolve_script_csv_path(f_path: str, f_name: str) -> str:
    """Return the ``_audited.csv`` path for a script file, else its ``.csv`` path.

    Raises:
        FileNotFoundError: if neither file exists
    """
    path = f"{f_path}{f_name.rsplit('.', 1)[0] + '_audited.csv'}"
    if not os.path.exists(path):
        path = path.replace("_audited.csv", ".csv")
    if not os.path.exists(path):
        raise FileNotFoundError(f"Not found : {path}")
    return path


def get_csv_from_id(script_id: str, encoding: str = "utf-8") -> pd.DataFrame:
    """
    Loads the csv of a script, preferring the audited one.

    Args:
        script_id (str): id passed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): encoding tried first; utf-16 is tried on UnicodeError

    Returns:
        pd.DataFrame: parsed csv

    Raises:
        FileNotFoundError: if neither csv exists
    """
    # NOTE(review): ps_models is not imported in this module (see top of file).
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
    path = _resolve_script_csv_path(f_path, f_name)
    try:
        df = pd.read_csv(path, encoding=encoding)
    except UnicodeError:
        df = pd.read_csv(path, encoding="utf-16")
    return df


def get_csv_path_from_id(script_id: str, encoding: str = "utf-8") -> str:
    """
    Returns the path of the csv of a script, preferring the audited one.

    Args:
        script_id (str): id passed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): unused; kept so existing callers keep working

    Returns:
        str: path of the existing csv file

    Raises:
        FileNotFoundError: if neither csv exists
    """
    # NOTE(review): ps_models is not imported in this module (see top of file).
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id, "csv")
    return _resolve_script_csv_path(f_path, f_name)


def _paragraph_fields(paragraph: dict) -> tuple:
    """Return ``(content, script_element)`` for one editor paragraph node."""
    script_element = paragraph["attrs"]["scriptElement"]
    if script_element == "blank":
        content = None
    elif "content" in paragraph:
        # Only the first child node's text is used.
        content = paragraph["content"][0]["text"]
    else:
        content = ""
    return content, script_element


def _drop_repeated_blanks(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every "blank" row that directly follows another "blank" row.

    The remaining rows keep their original index labels.
    """
    is_blank = df["script_element"] == "blank"
    repeated = is_blank & is_blank.shift(1, fill_value=False)
    return df.loc[~repeated]


def json_to_csv(json_obj: dict) -> pd.DataFrame:
    """
    Flattens an editor JSON document (scenes containing paragraphs) to a frame.

    Args:
        json_obj (dict): document whose ``content`` is a list of scene nodes

    Returns:
        pd.DataFrame: columns ``para_no``, ``scene_no``, ``content``,
        ``script_element``; runs of blank rows are reduced to one
    """
    columns = ["para_no", "scene_no", "content", "script_element"]
    scene_nodes: list = json_obj["content"]

    rows = []
    para = 1
    for scene_no, scene in enumerate(scene_nodes, start=1):
        if "content" not in scene:
            continue
        for paragraph in scene["content"]:
            cn, se = _paragraph_fields(paragraph)
            # Accumulate every paragraph; the frame used to be overwritten
            # here, which kept only the last row.
            rows.append({"para_no": para, "scene_no": scene_no, "content": cn, "script_element": se})
            para += 1

    df = pd.DataFrame(rows, columns=columns)
    return _drop_repeated_blanks(df)


def csv_to_json(csv: pd.DataFrame) -> str:
    """Serialises the frame with ``to_json(orient="index")``."""
    jsons: str = csv.to_json(orient="index")
    return jsons


def docx_to_pdf(doc_path: str, path: str) -> str:
    """
    Converts a docx file to pdf by calling ``soffice``.

    The exit status of ``soffice`` is not checked, so the returned path is
    where the pdf is expected to be, not a guarantee that it exists.

    Args:
        doc_path (str): path of the docx file
        path (str): output directory

    Returns:
        str: ``<path>/<doc file name without extension>.pdf``
    """
    subprocess.call(
        [
            "soffice",
            # '--headless',
            "--convert-to",
            "pdf",
            "--outdir",
            path,
            doc_path,
        ]
    )
    # [-1] instead of [1]: a doc_path without '/' used to raise IndexError.
    stem = doc_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    pdf_path = path + '/' + stem + ".pdf"
    return pdf_path


def get_plain_text(script_id: str = None, path: str = None) -> str:
    """
    Extracts the text of a txt / pdf / docx script.

    Args:
        script_id (str): used to look the file up when ``path`` is not given
        path (str): path of the file; takes precedence over ``script_id``

    Returns:
        str: text of a docx file; ``""`` for pdf (extraction is disabled).
        For txt the opened binary file object is returned, not a string.

    Raises:
        TypeError: for any other file extension
    """
    output_converted_txt: str = ""
    if path:
        f_name = path
    else:
        # NOTE(review): ps_models is not imported in this module (see top of file).
        f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
        path = f"{f_path}{f_name}"

    name = str(f_name)
    # Opened for every type so a missing file still raises before the type check.
    input_file = open(path, "rb")
    if name.endswith("txt"):
        # NOTE(review): returns the open handle despite the `-> str`
        # annotation; the caller has to read and close it.
        return input_file
    input_file.close()

    if name.endswith("pdf"):
        # pdf = pdftotext.PDF(input_file)
        # output_converted_txt = "\n\n".join(pdf)
        # commented on 08-2-24
        pass
    elif name.endswith("docx"):
        # NOTE(review): `parser` (tika) is not imported in this module.
        parsed = parser.from_file(path)
        output_converted_txt = parsed["content"]
    else:
        raise TypeError(
            f"conversion of {f_name.rsplit('.', maxsplit=1)[-1]} files not supported"
        )
    return output_converted_txt


def pdf_to_docx(input_script: str, output_converted_docx: str) -> None:
    """Converts the pdf at ``input_script`` to a docx at ``output_converted_docx``."""
    parse(input_script, output_converted_docx, start=0, end=None)


def send_email_to_user(user, screenplay_name, subject, message):  # removed flag = 1
    """
    Sends the ``audit/coree_email.html`` template to ``user.email``.

    The mail carries an html part and a tag-stripped text part.

    Args:
        user: object with an ``email`` attribute; also rendered as ``Name``
        screenplay_name: rendered as ``story_name``
        subject: mail subject; a trailing "." is appended
        message: rendered as ``message``
    """
    subject = subject + "."
    from_email = settings.EMAIL_HOST_USER
    to = user.email
    context = {
        "Name": user,
        "story_name": screenplay_name,
        "message": message,
    }
    html_content = render_to_string("audit/coree_email.html", context)
    text_content = strip_tags(html_content)
    msg = EmailMultiAlternatives(subject, text_content, from_email, [to])
    msg.attach_alternative(html_content, "text/html")
    msg.send()


# NOTE(review): the source of `screen_play_details` is truncated. The
# commented-out version below breaks off in the middle of a condition, and the
# text that follows is the tail of a live definition whose header and variable
# initialisation are missing. The text is kept verbatim as comments; restore
# the function from version control rather than from this fragment.
#
# def screen_play_details(script_id):
#     """
#     This Function will extract the details in the CSV
#     No of Dialogues
#     No of ActionLine
#     No of Scene
#     No of Speakers
#     Speakers
#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")
#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_locations= set()
#     num_speakers = set()
#     num_tranisitions = 0
#     num_parenthetical = 0
#     num_special_terms = 0
#     num_INT = 0
#     num_EXT = 0
#     longest_dialogue_len = 0
#     longest_dialogue_speaker = "None"
#     dialogue_count = {
#     }
#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)
#         for row in reader:
#             para_no, scene_no, content, script_element = row
#             if script_element == "action":
#                 num_actions += 1
#             if script_element == "dialogue":
#                 dialogue_count[content]+=1
#                 if longest_dialogue_len
#
# ---- source text missing here ----
#
#                 longest_dialogue_len:
#                     longest_dialogue_scene = scene_no
#                     longest_dialogue_speaker = speakers[-1]
#                     longest_dialogue_len = len(content)
#                 num_dialogues += 1
#             if script_element == "slugline":
#                 if content.startswith("INT"):
#                     num_INT += 1
#                 elif content.startswith("EXT"):
#                     num_EXT += 1
#                 num_scenes += 1
#                 num_locations.add(content)
#             if script_element == "parenthetical":
#                 num_parenthetical += 1
#             if script_element == "transition":
#                 num_tranisitions += 1
#             if script_element == "special_term":
#                 num_special_terms += 1
#             if script_element == "speaker":
#                 speakers.append(content)
#                 num_speakers.add(content)
#             # Calculate the word count for this script element and add it to the total word count
#             words = content.split(" ")
#             total_word_count += len(words)
#     # Estimate the number of pages based on the total word count
#     # interval_at = Script.objects.get(id=script_id).interval_at
#     # lock_status = Script.objects.get(id=script_id).lock_status
#     # no_of_pages = Script.objects.get(id=script_id).no_of_pages
#     #gpt_count = BeatSheet.objects.filter(script__id=script_id, had_used_gpt=True).count()
#     max_dialogue_speaker = max(dialogue_count, key=lambda word: dialogue_count[word])
#     max_dialogue_count = dialogue_count[max_dialogue_speaker]
#     num_speaker = len(num_speakers)
#     details = {
#         "interval_at": interval_at,
#         "lock_status": lock_status,
#         "no_of_pages": no_of_pages,
#         "action": num_actions,
#         "dialogues": num_dialogues,
#         "scenes": num_scenes,
#         "num_of_speakers": num_speaker,
#         "special_term": num_special_terms,
#         "parenthetical": num_parenthetical,
#         "transition": num_tranisitions,
#         "speakers": list(num_speakers),
#         "max_dialogue_speaker": max_dialogue_speaker,
#         "max_dialogue_count": max_dialogue_count,
#         "longest_dialogue_speaker": longest_dialogue_speaker,
#         "longest_dialogue_len": longest_dialogue_len,
#         "longest_dialogue_scene" : longest_dialogue_scene,
#         "num_of_locations": len(num_locations),
#         "locations": list(num_locations),
#         "num_INT" : num_INT,
#         "num_EXT" : num_EXT,
#         "gpt_count": gpt_count,
#         "total_word_count": total_word_count,  # Include the estimated page count in the result
#     }
#     return details


def json_to_csv_scriptpad(json_obj: list) -> pd.DataFrame:
    """
    Flattens a flat list of editor paragraph nodes to a frame.

    Args:
        json_obj (list): paragraph nodes

    Returns:
        pd.DataFrame: columns ``para_no``, ``content``, ``script_element``;
        runs of blank rows are reduced to one
    """
    print('inside json to csv', json_obj)
    columns = ["para_no", "content", "script_element"]

    rows = []
    for para, paragraph in enumerate(json_obj, start=1):
        cn, se = _paragraph_fields(paragraph)
        rows.append({"para_no": para, "content": cn, "script_element": se})

    df = _drop_repeated_blanks(pd.DataFrame(rows, columns=columns))
    print("csv df", df)
    return df


# def Screen_Play_status_calculator(script_id):
#     """
#     This Function will extract the details in the CSV
#     No of Pages
#     No of Words
#     No of Dialogues
#     No of ActionLine
#     No of Scene
#     No of Speakers
#     Speakers
#     Chatgpt Use Count
#     Longest Dialogue
#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")
#     docx_file = get_file_path(script_id, "script-docx")
#     doc = docx.Document(docx_file)
#     # Initialize counters for paragraphs and lines
#     num_paragraphs = 0
#     num_lines = 0
#     # Iterate through paragraphs and count sd lines
#     for paragraph in doc.paragraphs:
#         num_paragraphs += 1
#         num_lines += len(paragraph.text.split('\n'))
#     words_per_page = 250
#     pages = num_lines / words_per_page
#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_speakers = set()
#     num_scene = 0
#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)
#         for row in reader:
#             para_no, scene_no, content, script_element = row
#             if script_element == "action":
#                 num_actions += 1
#             if script_element == "dialogue":
#                 num_dialogues += 1
#             if script_element == "slugline":
#                 num_slugline += 1
#             if script_element == "speaker":
#                 num_speakers.add(content)
#             # if scene_no != num_scene:
#             #     num_scene+=1
#     num_speaker = len(num_speakers)
#     details = {
#         "action": num_actions,
#         "dialogues":num_dialogues,
#         "scenes": num_dialogues,
#         "num_of_speaker": num_speaker,
#         "speakers": list(num_speakers)
#     }
#     # print("Number of action lines:", num_actions)
#     # print("Number of dialogues:", num_dialogues)
#     # print("Number of scenes:", num_slugline)
#     # print("Number of speaker:", len(num_speaker))
#     return details