Conversion_Kitchen_Code/kitchen_counter/MNF/utils/utilities.py

from io import IOBase, StringIO
import os
import csv
import subprocess

from centralisedFileSystem.models import File, ScreenPlay, Script, BeatSheet
from bs4 import BeautifulSoup as bfs
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Mm, Pt
import pandas as pd
# import pdftotext
from pdf2docx import parse
from tika import parser
import page_script.models as ps_models
from django.conf import settings
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.core.mail import EmailMultiAlternatives

from .filesystem import get_file_path


def fdx_to_csv(fdx_file: IOBase) -> pd.DataFrame:
    """
    Converts the fdx document to a Pandas DataFrame.
    Give a FileIO object as input, not a file path.

    Args:
        fdx_file (IOBase): fdx file object

    Returns:
        pd.DataFrame: csv generated from the given fdx
    """
    fdx_content = fdx_file.read()
    soup = bfs(fdx_content, 'xml')
    paragraphs = soup.find_all('Paragraph')
    df = pd.DataFrame(columns=['Text', 'Script_Element'])
    for para in paragraphs:
        try:
            script_element = para['Type']
        except KeyError:
            # Paragraphs without a Type attribute carry no script element; skip them
            continue
        dirty_texts = para.find_all('Text')
        cleaned_texts = [text.get_text().strip() for text in dirty_texts]
        full_txt = " ".join(cleaned_texts)
        full_txt = full_txt.strip()
        _d = pd.DataFrame({'Text': [full_txt], 'Script_Element': [script_element]})
        df = pd.concat([df, _d], ignore_index=True)
    return df


def fdx_to_txt(fdx_file: IOBase) -> str:
    """
    Converts the fdx document to plain text (a string with indentation).
    Give a FileIO object as input, not a file path.

    Args:
        fdx_file (IOBase): fdx file object

    Returns:
        str: string generated from the given fdx
    """
    df = fdx_to_csv(fdx_file)
    count = len(df)
    with StringIO() as f:
        for idx, txt, script_element in df.itertuples():
            # Indent each element type by a fixed left margin
            if script_element == 'Character':
                line = txt.rjust(len(txt) + 35)
            elif script_element == 'Dialogue':
                line = txt.rjust(len(txt) + 25)
            elif script_element == 'Parenthetical':
                line = txt.rjust(len(txt) + 30)
            elif script_element == 'Transition':
                line = txt.rjust(len(txt) + 55)
            else:
                line = txt.rjust(len(txt) + 15)
            # if script_element in ('Action', 'Scene Heading', 'Transition'):
            #     f.write('\n')
            f.write(line)
            if idx < (count - 1):
                if script_element in ('Dialogue', 'Action', 'Scene Heading', 'Transition'):
                    if not (script_element == 'Dialogue' and df['Script_Element'][idx + 1] in ('Dialogue', 'Parenthetical')):
                        f.write('\n')
                        # f.write('\n')
        return f.getvalue()
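
# Illustrative usage sketch for the two fdx helpers above (the "example.fdx" path is
# hypothetical); both expect an open binary file object, not a file path:
#
#     with open("example.fdx", "rb") as fdx:
#         element_df = fdx_to_csv(fdx)
#     with open("example.fdx", "rb") as fdx:
#         plain_text = fdx_to_txt(fdx)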


def csv_to_docx(csv: pd.DataFrame) -> Document:
    output_doc = Document()
    style = output_doc.styles["Normal"]
    font = style.font
    font.name = "Courier New"
    font.size = Pt(12)
    section = output_doc.sections[0]
    section.page_height = Mm(297)
    a4_right = 8.57
    section.page_width = Inches(a4_right)
    section.left_margin = Inches(1.5)
    for index in csv.index:
        para = output_doc.add_paragraph()
        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)
        script_element = csv["script_element"][index]
        content = csv["content"][index]
        print("content = ", content)
        if script_element == "blank":
            continue
        elif script_element == "slugline":
            paragraph_format.left_indent = Inches(0)
            paragraph_format.right_indent = Inches(0)
            print("content is slugline")
            try:
                content = content.upper()
            except Exception as exp:
                # Non-string content (e.g. NaN) cannot be upper-cased; keep it as-is
                print("Exception =", exp)
        elif script_element == "action":
            paragraph_format.left_indent = Inches(0)
            paragraph_format.right_indent = Inches(0)
        elif script_element == "dialogue":
            paragraph_format.left_indent = Inches(1.0)
            paragraph_format.right_indent = Inches(1.25)
        elif script_element == "parenthetical":
            paragraph_format.left_indent = Inches(1.5)
            paragraph_format.right_indent = Inches(2.25)
        elif script_element == "speaker":
            paragraph_format.left_indent = Inches(2)
            paragraph_format.right_indent = Inches(1)
            print("content is speaker")
            try:
                content = content.upper()
            except Exception as exp:
                print("Exception =", exp)
        elif script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            paragraph_format.left_indent = Inches(2.5)
            paragraph_format.right_indent = Inches(0)
        elif script_element == "special_term":
            paragraph_format.left_indent = Inches(0)
            paragraph_format.right_indent = Inches(0)
        if isinstance(content, float):
            # Empty cells read from csv come through as NaN floats; render them as empty text
            content = ""
        para.text = content
    return output_doc
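
# Illustrative usage sketch for csv_to_docx (the column names follow the ones read above;
# the output path is hypothetical): build a DataFrame with "content" and "script_element"
# columns and save the resulting python-docx Document.
#
#     rows = [
#         {"content": "INT. KITCHEN - NIGHT", "script_element": "slugline"},
#         {"content": "MARIA", "script_element": "speaker"},
#         {"content": "Dinner is ready.", "script_element": "dialogue"},
#     ]
#     doc = csv_to_docx(pd.DataFrame(rows))
#     doc.save("/tmp/example_script.docx")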


def get_csv_from_id(script_id: str, encoding: str = "utf-8") -> pd.DataFrame:
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
    path = f"{f_path}{f_name.rsplit('.', 1)[0] + '_audited.csv'}"
    if not os.path.exists(path):
        # Fall back to the unaudited csv if no audited version exists
        path = path.replace("_audited.csv", ".csv")
        if not os.path.exists(path):
            raise FileNotFoundError(f"Not found : {path}")
    try:
        df = pd.read_csv(path, encoding=encoding)
    except UnicodeError:
        df = pd.read_csv(path, encoding="utf-16")
    return df


def get_csv_path_from_id(script_id: str, encoding: str = "utf-8") -> str:
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id, "csv")
    path = f"{f_path}{f_name.rsplit('.', 1)[0] + '_audited.csv'}"
    if not os.path.exists(path):
        path = path.replace("_audited.csv", ".csv")
        if not os.path.exists(path):
            raise FileNotFoundError(f"Not found : {path}")
    return path


def json_to_csv(json_obj: dict) -> pd.DataFrame:
    df = pd.DataFrame(columns=["para_no", "scene_no", "content", "script_element"])
    scene_nodes: list = json_obj["content"]
    para = 1
    for scene_no, scene in enumerate(scene_nodes, start=1):
        if "content" not in scene.keys():
            continue
        for paragraph in scene["content"]:
            se = paragraph["attrs"]["scriptElement"]
            cn = ""
            if se == "blank":
                cn = None
            elif "content" in paragraph.keys():
                cn = paragraph["content"][0]["text"]
            row = {"para_no": para, "scene_no": scene_no, "content": cn, "script_element": se}
            para += 1
            # Append the row to the DataFrame (df.append was removed in recent pandas)
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    # Collapse consecutive "blank" rows down to a single blank row
    prv_el_bl = False
    for index, row in df.iterrows():
        el = row["script_element"]
        if el == "blank" and not prv_el_bl:
            prv_el_bl = True
        elif el == "blank" and prv_el_bl:
            df.drop(index=index, inplace=True)
        else:
            prv_el_bl = False
    return df
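
# The JSON accepted by json_to_csv is assumed (from the keys accessed above) to be an
# editor document of scene nodes, each holding paragraph nodes whose attrs.scriptElement
# names the element type, for example:
#
#     example_json = {
#         "content": [
#             {"content": [
#                 {"attrs": {"scriptElement": "slugline"},
#                  "content": [{"text": "INT. KITCHEN - NIGHT"}]},
#                 {"attrs": {"scriptElement": "blank"}},
#             ]},
#         ]
#     }
#     df = json_to_csv(example_json)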


def csv_to_json(csv: pd.DataFrame) -> str:
    jsons: str = csv.to_json(orient="index")
    return jsons


def docx_to_pdf(doc_path: str, path: str) -> str:
    subprocess.call(
        [
            "soffice",
            # '--headless',
            "--convert-to",
            "pdf",
            "--outdir",
            path,
            doc_path,
        ]
    )
    pdf_path = path + '/' + doc_path.rsplit('/', 1)[1].rsplit('.', 1)[0] + ".pdf"
    return pdf_path
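
# Illustrative usage sketch for docx_to_pdf; it shells out to LibreOffice, so the
# "soffice" binary must be on PATH. The paths below are hypothetical:
#
#     pdf_path = docx_to_pdf("/tmp/example_script.docx", "/tmp")
#     # -> "/tmp/example_script.pdf"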


def get_plain_text(script_id: str = None, path: str = None) -> str:
    output_converted_txt: str = ""
    if path:
        f_name = path
    else:
        f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
        path = f"{f_path}{f_name}"
    with open(path, "rb") as input_file:
        if str(f_name).endswith("txt"):
            # Read the raw text so a string (not a file object) is returned
            output_converted_txt = input_file.read().decode("utf-8", errors="ignore")
        elif str(f_name).endswith("pdf"):
            # pdf = pdftotext.PDF(input_file)
            # output_converted_txt = "\n\n".join(pdf)  # commented on 08-02-24
            pass
        elif str(f_name).endswith("docx"):
            parsed = parser.from_file(path)
            output_converted_txt = parsed["content"]
        else:
            raise TypeError(
                f"conversion of {f_name.rsplit('.', maxsplit=1)[-1]} files not supported"
            )
    return output_converted_txt
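
# Illustrative usage sketch for get_plain_text (the path is hypothetical); .docx input is
# routed through tika, .txt is read directly, and .pdf extraction is currently disabled:
#
#     text = get_plain_text(path="/tmp/example_script.docx")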


def pdf_to_docx(input_script: str, output_converted_docx: str) -> None:
    parse(input_script, output_converted_docx, start=0, end=None)


def send_email_to_user(user, screenplay_name, subject, message):  # removed flag = 1
    subject = subject + "."
    from_email = settings.EMAIL_HOST_USER
    to = user.email
    context = {
        "Name": user,
        "story_name": screenplay_name,
        "message": message,
    }
    html_content = render_to_string(
        "audit/coree_email.html", context
    )
    text_content = strip_tags(html_content)
    msg = EmailMultiAlternatives(subject, text_content, from_email, [to])
    msg.attach_alternative(html_content, "text/html")
    msg.send()


# def screen_play_details(script_id):
#     """
#     This Function will extract the details in the CSV
#         No of Dialogues
#         No of ActionLine
#         No of Scene
#         No of Speakers
#         Speakers
#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")
#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_locations = set()
#     num_speakers = set()
#     num_tranisitions = 0
#     num_parenthetical = 0
#     num_special_terms = 0
#     num_INT = 0
#     num_EXT = 0
#     longest_dialogue_len = 0
#     longest_dialogue_speaker = "None"
#     dialogue_count = {}
#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)
#         for row in reader:
#             para_no, scene_no, content, script_element = row
#             if script_element == "action":
#                 num_actions += 1
#             if script_element == "dialogue":
#                 dialogue_count[content] += 1
#                 if longest_dialogue_len < len(content):
#                     longest_dialogue_speaker = content
#                 longest_dialogue_len = max(len(content), longest_dialogue_len)
#                 num_dialogues += 1
#             if script_element == "slugline":
#                 if content[0:3] == "INT":
#                     num_INT += 1
#                 if content[0:3] == "EXT":
#                     num_EXT += 1
#                 num_slugline += 1
#                 num_locations.add(content)
#             if script_element == "parenthetical":
#                 num_parenthetical += 1
#             if script_element == "transition":
#                 num_tranisitions += 1
#             if script_element == "special_term":
#                 num_special_terms += 1
#             if script_element == "speaker":
#                 num_speakers.add(content)
#     max_dialogue_speaker = max(dialogue_count, key=lambda word: dialogue_count[word])
#     max_dialouge_count = dialogue_count[max_dialogue_speaker]
#     num_speaker = len(num_speakers)
#     details = {
#         "action": num_actions,
#         "dialogues": num_dialogues,
#         "scenes": num_dialogues,
#         "num_of_speaker": num_speaker,
#         "special_term": num_special_terms,
#         "parenthetical": num_parenthetical,
#         "transition": num_tranisitions,
#         "speakers": list(num_speakers),
#         "max_dialogue_speaker": max_dialogue_speaker,
#         "max_dialouge_count": max_dialouge_count,
#         "longest_dialogue_speaker": longest_dialogue_speaker,
#         "longest_dialogue_len": longest_dialogue_len,
#         "num_of_locations": len(num_locations)
#     }
#     # print("Number of action lines:", num_actions)
#     # print("Number of dialogues:", num_dialogues)
#     # print("Number of scenes:", num_slugline)
#     # print("Number of speaker:", len(num_speaker))
#     return details


def estimate_page_count(word_count, words_per_page=100):
    # You can adjust the words_per_page value based on your specific script formatting
    return word_count / words_per_page
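
# Worked example (using the default of 100 words per page):
#
#     estimate_page_count(12000)        # -> 120.0
#     estimate_page_count(12000, 250)   # -> 48.0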


def screen_play_details(script_id):
    print("Entering the ScreenPlay Details extraction")
    file_name = get_file_path(script_id, "script-csv")
    num_actions = 0
    num_dialogues = 0
    num_scenes = 0
    num_locations = set()
    num_speakers = set()
    num_transitions = 0
    num_parenthetical = 0
    num_special_terms = 0
    num_INT = 0
    num_EXT = 0
    longest_dialogue_len = 0
    longest_dialogue_scene = 0
    longest_dialogue_speaker = "None"
    dialogue_count = {}
    speakers = []
    total_word_count = 0  # Running word count across the whole script
    gpt_count = 0
    interval_at = 0
    lock_status = False
    no_of_pages = 0
    with open(file_name, "r") as file:
        reader = csv.reader(file)
        header = next(reader)
        for row in reader:
            para_no, scene_no, content, script_element = row
            if script_element == "action":
                num_actions += 1
            if script_element == "dialogue":
                # Attribute the dialogue to the most recent speaker, if one has been seen
                if speakers:
                    speaker = speakers[-1]
                    dialogue_count[speaker] = dialogue_count.get(speaker, 0) + 1
                    if len(content) > longest_dialogue_len:
                        longest_dialogue_scene = scene_no
                        longest_dialogue_speaker = speaker
                        longest_dialogue_len = len(content)
                num_dialogues += 1
            if script_element == "slugline":
                if content.startswith("INT"):
                    num_INT += 1
                elif content.startswith("EXT"):
                    num_EXT += 1
                num_scenes += 1
                num_locations.add(content)
            if script_element == "parenthetical":
                num_parenthetical += 1
            if script_element == "transition":
                num_transitions += 1
            if script_element == "special_term":
                num_special_terms += 1
            if script_element == "speaker":
                speakers.append(content)
                num_speakers.add(content)
            # Add this element's word count to the running total
            total_word_count += len(content.split(" "))
    # The page estimate and script flags below are currently not read from the database
    # interval_at = Script.objects.get(id=script_id).interval_at
    # lock_status = Script.objects.get(id=script_id).lock_status
    # no_of_pages = Script.objects.get(id=script_id).no_of_pages
    # gpt_count = BeatSheet.objects.filter(script__id=script_id, had_used_gpt=True).count()
    if dialogue_count:
        max_dialogue_speaker = max(dialogue_count, key=lambda word: dialogue_count[word])
        max_dialogue_count = dialogue_count[max_dialogue_speaker]
    else:
        max_dialogue_speaker = "None"
        max_dialogue_count = 0
    num_speaker = len(num_speakers)
    details = {
        "interval_at": interval_at,
        "lock_status": lock_status,
        "no_of_pages": no_of_pages,
        "action": num_actions,
        "dialogues": num_dialogues,
        "scenes": num_scenes,
        "num_of_speakers": num_speaker,
        "special_term": num_special_terms,
        "parenthetical": num_parenthetical,
        "transition": num_transitions,
        "speakers": list(num_speakers),
        "max_dialogue_speaker": max_dialogue_speaker,
        "max_dialogue_count": max_dialogue_count,
        "longest_dialogue_speaker": longest_dialogue_speaker,
        "longest_dialogue_len": longest_dialogue_len,
        "longest_dialogue_scene": longest_dialogue_scene,
        "num_of_locations": len(num_locations),
        "locations": list(num_locations),
        "num_INT": num_INT,
        "num_EXT": num_EXT,
        "gpt_count": gpt_count,
        "total_word_count": total_word_count,
        # The estimated page count (estimate_page_count) is not yet included here
    }
    return details
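
# Illustrative usage sketch for screen_play_details (the script id is hypothetical); it
# reads the script's CSV via get_file_path and returns the summary dict built above:
#
#     details = screen_play_details("some-script-id")
#     print(details["scenes"], details["max_dialogue_speaker"])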


def json_to_csv_scriptpad(json_obj: list) -> pd.DataFrame:
    df = pd.DataFrame(columns=["para_no", "content", "script_element"])
    para = 1
    for paragraph in json_obj:
        se = paragraph["attrs"]["scriptElement"]
        cn = ""
        if se == "blank":
            cn = None
        elif "content" in paragraph.keys():
            cn = paragraph["content"][0]["text"]
        row = {"para_no": para, "content": cn, "script_element": se}
        para += 1
        # Append the row to the DataFrame (df.append was removed in recent pandas)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    # Collapse consecutive "blank" rows down to a single blank row
    prv_el_bl = False
    for index, row in df.iterrows():
        el = row["script_element"]
        if el == "blank" and not prv_el_bl:
            prv_el_bl = True
        elif el == "blank" and prv_el_bl:
            df.drop(index=index, inplace=True)
        else:
            prv_el_bl = False
    return df


# def Screen_Play_status_calculator(script_id):
#     """
#     This Function will extract the details in the CSV
#         No of Pages
#         No of Words
#         No of Dialogues
#         No of ActionLine
#         No of Scene
#         No of Speakers
#         Speakers
#         Chatgpt Use Count
#         Longest Dialogue
#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")
#     docx_file = get_file_path(script_id, "script-docx")
#     doc = docx.Document(docx_file)
#     # Initialize counters for paragraphs and lines
#     num_paragraphs = 0
#     num_lines = 0
#     # Iterate through paragraphs and count lines
#     for paragraph in doc.paragraphs:
#         num_paragraphs += 1
#         num_lines += len(paragraph.text.split('\n'))
#     words_per_page = 250
#     pages = num_lines / words_per_page
#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_speakers = set()
#     num_scene = 0
#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)
#         for row in reader:
#             para_no, scene_no, content, script_element = row
#             if script_element == "action":
#                 num_actions += 1
#             if script_element == "dialogue":
#                 num_dialogues += 1
#             if script_element == "slugline":
#                 num_slugline += 1
#             if script_element == "speaker":
#                 num_speakers.add(content)
#             # if scene_no != num_scene:
#             #     num_scene += 1
#     num_speaker = len(num_speakers)
#     details = {
#         "action": num_actions,
#         "dialogues": num_dialogues,
#         "scenes": num_dialogues,
#         "num_of_speaker": num_speaker,
#         "speakers": list(num_speakers)
#     }
#     # print("Number of action lines:", num_actions)
#     # print("Number of dialogues:", num_dialogues)
#     # print("Number of scenes:", num_slugline)
#     # print("Number of speaker:", len(num_speaker))
#     return details