Conversion_Kitchen_Code/kitchen_counter/utils/utilities.py

663 lines
19 KiB
Python
Executable File

from io import IOBase, StringIO
import os,csv
import subprocess
from centralisedFileSystem.models import File, ScreenPlay, Script
from bs4 import BeautifulSoup as bfs
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Mm, Pt
import pandas as pd
# import pdftotext
from pdf2docx import parse
#from tika import parser
# import page_script.models as ps_models
from django.conf import settings
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.core.mail import EmailMultiAlternatives
from .filesystem import get_file_path
def fdx_to_csv(fdx_file : IOBase) -> pd.DataFrame:
    """
    Converts the fdx document to a Pandas DataFrame.
    Give FileIO as input, Not file path.

    Every ``Paragraph`` element that carries a ``Type`` attribute becomes one
    row; paragraphs without that attribute are skipped. The text of all
    ``Text`` children is stripped and joined with single spaces.

    Args:
        fdx_file (IOBase): fdx File object
    Returns:
        pd.DataFrame: one row per paragraph with the columns ``Text`` and
        ``Script_Element`` (the paragraph's ``Type`` attribute)
    """
    soup = bfs(fdx_file.read(), 'xml')
    rows = []
    for para in soup.find_all('Paragraph'):
        # A paragraph without a Type cannot be classified, so it is skipped
        # (previously done through a bare ``except`` around para['Type']).
        script_element = para.get('Type')
        if script_element is None:
            continue
        cleaned_texts = [text.get_text().strip() for text in para.find_all('Text')]
        rows.append({
            'Text': " ".join(cleaned_texts).strip(),
            'Script_Element': script_element,
        })
    # Build the frame once instead of concatenating a one-row frame per
    # paragraph, which copies the whole frame on every iteration.
    return pd.DataFrame(rows, columns=['Text', 'Script_Element'])
def fdx_to_txt(fdx_file : IOBase) -> str:
    """
    Converts the fdx document to PlainText (string with indentations).
    Give FileIO as input, Not file path.

    Each paragraph is prefixed with a fixed number of spaces chosen by its
    script element. A newline is emitted only after Dialogue, Action,
    Scene Heading and Transition paragraphs, never after the last paragraph,
    and not after a Dialogue that is directly followed by another Dialogue
    or a Parenthetical.

    Args:
        fdx_file (IOBase): fdx File object
    Returns:
        str: string generated from given fdx
    """
    frame = fdx_to_csv(fdx_file)
    texts = frame['Text'].tolist()
    kinds = frame['Script_Element'].tolist()

    # Number of leading spaces per script element; anything else gets 15.
    left_pad = {
        'Character': 35,
        'Dialogue': 25,
        'Parenthetical': 30,
        'Transition': 55,
    }
    line_breaking = ('Dialogue', 'Action', 'Scene Heading', 'Transition')
    last_position = len(frame) - 1

    pieces = []
    for position, (txt, kind) in enumerate(zip(texts, kinds)):
        pieces.append(' ' * left_pad.get(kind, 15) + txt)
        if position >= last_position or kind not in line_breaking:
            continue
        # Consecutive dialogue / parenthetical paragraphs stay on one line.
        if kind == 'Dialogue' and kinds[position + 1] in ('Dialogue', 'Parenthetical'):
            continue
        pieces.append('\n')
    return ''.join(pieces)
def csv_to_docx(csv: pd.DataFrame) -> Document:
    """
    Render a screenplay table as a formatted python-docx document.

    One paragraph is added per row of ``csv`` (12pt Courier New, no spacing
    before/after, 12pt line spacing). Indentation depends on the row's
    ``script_element``; sluglines and speakers are upper-cased; transitions
    are right-aligned. Rows whose element is ``blank`` keep the empty
    paragraph that was added for them. Float content (e.g. NaN from an empty
    CSV cell) is written as an empty string.

    Args:
        csv (pd.DataFrame): frame with ``script_element`` and ``content`` columns
    Returns:
        Document: the populated document
    """
    # (left, right) paragraph indents in inches for each known script element.
    indents = {
        "slugline": (0, 0),
        "action": (0, 0),
        "dialogue": (1.0, 1.25),
        "parenthetical": (1.5, 2.25),
        "speaker": (2, 1),
        "transition": (2.5, 0),
        "special_term": (0, 0),
    }
    upper_cased = ("slugline", "speaker")

    output_doc = Document()
    font = output_doc.styles["Normal"].font
    font.name = "Courier New"
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    # NOTE(review): A4 is 8.27in wide; 8.57 may be a typo - confirm before changing.
    a4_right = 8.57
    section.page_width = Inches(a4_right)
    section.left_margin = Inches(1.5)

    for index in csv.index:
        para = output_doc.add_paragraph()
        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        script_element = csv["script_element"][index]
        content = csv["content"][index]

        if script_element == "blank":
            # The empty paragraph added above acts as the blank line.
            continue

        if script_element in indents:
            left, right = indents[script_element]
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)
        if script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        # Only real strings can be upper-cased; an explicit type check
        # replaces the former blanket ``except Exception`` around .upper().
        if script_element in upper_cased and isinstance(content, str):
            content = content.upper()

        if isinstance(content, float):
            content = ""
        para.text = content
    return output_doc
def get_csv_from_id(script_id : str, encoding : str ="utf-8") -> pd.DataFrame:
    """
    Load the CSV belonging to a script as a DataFrame.

    The ``<name>_audited.csv`` file is preferred; if it does not exist the
    plain ``<name>.csv`` is used instead. The file is read with ``encoding``
    and, when that fails with a UnicodeError, re-read as UTF-16.

    Args:
        script_id (str): id handed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): encoding tried first when reading the CSV
    Returns:
        pd.DataFrame: parsed CSV
    Raises:
        FileNotFoundError: neither CSV variant exists
    """
    # NOTE(review): ``ps_models`` is not imported in the visible part of this
    # file (the import line is commented out) - confirm it is defined elsewhere.
    directory, file_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
    stem = file_name.rsplit('.', 1)[0]
    csv_path = f"{directory}{stem}_audited.csv"
    if not os.path.exists(csv_path):
        csv_path = csv_path.replace("_audited.csv", ".csv")
        if not os.path.exists(csv_path):
            raise FileNotFoundError(f"Not found : {csv_path}")
    try:
        return pd.read_csv(csv_path, encoding=encoding)
    except UnicodeError:
        return pd.read_csv(csv_path, encoding="utf-16")
def get_csv_path_from_id(script_id : str, encoding : str ="utf-8") -> str:
    """
    Return the path of the CSV belonging to a script.

    The ``<name>_audited.csv`` file is preferred; if it does not exist the
    plain ``<name>.csv`` path is returned instead.

    Args:
        script_id (str): id handed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): unused; kept so existing callers keep working
    Returns:
        str: path of the existing CSV file (the annotation previously
        claimed ``pd.DataFrame`` although a path string is returned)
    Raises:
        FileNotFoundError: neither CSV variant exists
    """
    # NOTE(review): ``ps_models`` is not imported in the visible part of this
    # file (the import line is commented out) - confirm it is defined elsewhere.
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id, "csv")
    path = f"{f_path}{f_name.rsplit('.', 1)[0]}_audited.csv"
    if not os.path.exists(path):
        path = path.replace("_audited.csv", ".csv")
        if not os.path.exists(path):
            raise FileNotFoundError(f"Not found : {path}")
    return path
def json_to_csv(json_obj: dict) -> pd.DataFrame:
    """
    Flatten a scene-structured editor JSON document into a DataFrame.

    ``json_obj["content"]`` is a list of scene nodes; each scene that has a
    ``content`` list contributes one row per paragraph. Scenes are numbered
    from 1 by their position (scenes without content still consume a
    number); paragraphs are numbered from 1 across the whole document.
    For every run of consecutive ``blank`` paragraphs only the first one is
    kept; the index labels of the dropped rows are left as gaps.

    Args:
        json_obj (dict): document with a top-level ``content`` list of scenes
    Returns:
        pd.DataFrame: columns ``para_no``, ``scene_no``, ``content`` and
        ``script_element``
    """
    columns = ["para_no", "scene_no", "content", "script_element"]
    rows = []
    for scene_no, scene in enumerate(json_obj["content"], start=1):
        if "content" not in scene:
            continue
        for paragraph in scene["content"]:
            se = paragraph["attrs"]["scriptElement"]
            if se == "blank":
                cn = None
            elif "content" in paragraph:
                # Only the first text node of the paragraph is used.
                cn = paragraph["content"][0]["text"]
            else:
                cn = ""
            rows.append({
                "para_no": len(rows) + 1,
                "scene_no": scene_no,
                "content": cn,
                "script_element": se,
            })
    # Every row is collected first; the previous code rebuilt ``df`` from the
    # current row on each iteration, so only the last paragraph survived.
    df = pd.DataFrame(rows, columns=columns)

    # Collapse runs of blank paragraphs down to their first element.
    repeated_blanks = []
    prv_el_bl = False
    for label, el in df["script_element"].items():
        is_blank = el == "blank"
        if is_blank and prv_el_bl:
            repeated_blanks.append(label)
        prv_el_bl = is_blank
    return df.drop(index=repeated_blanks)
def csv_to_json(csv: pd.DataFrame) -> str:
    """
    Serialise a DataFrame to a JSON string keyed by row index.

    Args:
        csv (pd.DataFrame): frame to serialise
    Returns:
        str: result of ``DataFrame.to_json(orient="index")``
    """
    return csv.to_json(orient="index")
def docx_to_pdf(doc_path : str, path : str) -> str:
    """
    Convert a document to PDF by invoking the ``soffice`` command line tool.

    Args:
        doc_path (str): path of the document to convert
        path (str): output directory passed to ``soffice --outdir``
    Returns:
        str: ``path`` + "/" + the document's base name with a ``.pdf``
        extension. The exit status of ``soffice`` is not checked, so the
        returned path is not guaranteed to exist.
    """
    subprocess.call(
        [
            "soffice",
            # '--headless',
            "--convert-to",
            "pdf",
            "--outdir",
            path,
            doc_path,
        ]
    )
    # Index -1 instead of 1 so a doc_path without any '/' (a bare file name)
    # no longer raises IndexError.
    file_name = doc_path.rsplit('/', 1)[-1]
    pdf_path = path + '/' + file_name.rsplit('.', 1)[0] + ".pdf"
    return pdf_path
def get_plain_text(script_id: str = None, path : str = None) -> str:
    """
    Return the plain-text content of a script file.

    The file is taken from ``path`` when given; otherwise it is looked up
    through ``MNFScriptDatabase_2.get_file_path(script_id)``.

    Behaviour per file name suffix:
      * ``txt``  - the file object opened in binary mode is returned; the
        caller is responsible for reading and closing it.
        NOTE(review): this contradicts the ``-> str`` annotation; kept
        because callers may rely on it - confirm before changing.
      * ``pdf``  - extraction is disabled, an empty string is returned.
      * ``docx`` - ``parser.from_file(path)["content"]`` is returned.
        NOTE(review): ``parser`` is not imported in the visible part of this
        file (the tika import is commented out) - confirm.

    Args:
        script_id (str): script id, used only when ``path`` is not given
        path (str): explicit file path
    Returns:
        str: extracted text (see the notes above for ``txt``)
    Raises:
        TypeError: the file type is not supported
    """
    # Defined up front so the pdf branch always has a value to return, also
    # when the file is given by ``path``.
    output_converted_txt: str = ""
    if path:
        # An explicit path wins, also when script_id is passed alongside it
        # (that combination previously left ``f_name`` unbound).
        f_name = path
    else:
        # NOTE(review): ``ps_models`` is not imported in the visible part of
        # this file (the import line is commented out) - confirm.
        f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
        path = f"{f_path}{f_name}"

    file_name = str(f_name)
    # Opened for every file type, as before, so a missing file still raises
    # FileNotFoundError regardless of its suffix.
    input_file = open(path, "rb")
    if file_name.endswith("txt"):
        return input_file
    # No other branch uses the handle; close it instead of leaking it.
    input_file.close()

    if file_name.endswith("pdf"):
        # pdf = pdftotext.PDF(input_file)
        # output_converted_txt = "\n\n".join(pdf) #commented on 08-2-24
        return output_converted_txt
    if file_name.endswith("docx"):
        parsed = parser.from_file(path)
        return parsed["content"]
    raise TypeError(
        f"conversion of {f_name.rsplit('.', maxsplit=1)[-1]} files not supported"
    )
def pdf_to_docx(input_script : str, output_converted_docx : str) -> None:
    """
    Hand a PDF to ``pdf2docx.parse`` for conversion.

    Args:
        input_script (str): first positional argument to ``parse`` (the source)
        output_converted_docx (str): second positional argument (the target)
    """
    # start=0 / end=None are passed exactly as before.
    page_range = {"start": 0, "end": None}
    parse(input_script, output_converted_docx, **page_range)
def send_email_to_user(user,screenplay_name,subject,message):# removed flag = 1
    """
    Send an HTML e-mail (with a plain-text alternative) to ``user.email``.

    The body is rendered from the ``audit/coree_email.html`` template with
    the keys ``Name``, ``story_name`` and ``message``; the plain-text part
    is the same markup with tags stripped. A "." is appended to ``subject``.
    The sender is ``settings.EMAIL_HOST_USER``.
    """
    full_subject = subject + "."
    sender = settings.EMAIL_HOST_USER
    recipient = user.email

    html_body = render_to_string(
        "audit/coree_email.html",
        {
            "Name": user,
            "story_name": screenplay_name,
            "message" : message,
        },
    )
    plain_body = strip_tags(html_body)

    mail = EmailMultiAlternatives(full_subject, plain_body, sender, [recipient])
    mail.attach_alternative(html_body, "text/html")
    mail.send()
# def screen_play_details(script_id):
# """
# This Function will extract the details in the CSV
# No of Dialogues
# No of ActionLine
# No of Scene
# No of Speakers
# Speakers
# """
# print("Entering the ScreenPlay Details extraction")
# file_name = get_file_path(script_id, "script-csv")
# num_actions = 0
# num_dialogues = 0
# num_slugline = 0
# num_locations= set()
# num_speakers = set()
# num_tranisitions = 0
# num_parenthetical = 0
# num_special_terms = 0
# num_INT = 0
# num_EXT = 0
# longest_dialogue_len = 0
# longest_dialogue_speaker = "None"
# dialogue_count = {
# }
# with open(file_name, "r") as file:
# reader = csv.reader(file)
# header = next(reader)
# for row in reader:
# para_no, scene_no, content, script_element = row
# if script_element == "action":
# num_actions += 1
# if script_element == "dialogue":
# dialogue_count[content]+=1
# if longest_dialogue_len <len(content):
# longest_dialogue_speaker = content
# longest_dialogue_len = max(len(content),longest_dialogue_len)
# num_dialogues += 1
# if script_element == "slugline":
# if content[0:3]=="INT":
# num_INT+=1
# if content[0:3]=="EXT":
# num_EXT+=1
# num_slugline += 1
# num_locations.add(content)
# if script_element == "parenthetical":
# num_parenthetical += 1
# if script_element == "transition":
# num_tranisitions += 1
# if script_element == "special_term":
# num_special_terms += 1
# if script_element == "speaker":
# num_speakers.add(content)
# max_dialogue_speaker = max(dialogue_count, key=lambda word: dialogue_count[word])
# max_dialouge_count = dialogue_count[max_dialogue_speaker]
# num_speaker = len(num_speakers)
# details = {
# "action": num_actions,
# "dialogues":num_dialogues,
# "scenes": num_dialogues,
# "num_of_speaker": num_speaker,
# "special_term" : num_special_terms,
# "parenthetical" : num_parenthetical,
# "transition" : num_tranisitions,
# "speakers": list(num_speakers),
# "max_dialogue_speaker" : max_dialogue_speaker,
# "max_dialouge_count" : max_dialouge_count,
# "longest_dialogue_speaker" : longest_dialogue_speaker,
# "longest_dialogue_len" : longest_dialogue_len,
# "num_of_locations" : len(num_locations)
# }
# # print("Number of action lines:", num_actions)
# # print("Number of dialogues:", num_dialogues)
# # print("Number of scenes:", num_slugline)
# # print("Number of speaker:", len(num_speaker))
# return details
import csv
def estimate_page_count(word_count, words_per_page=100):
    """
    Estimate how many pages a script takes up.

    Returns ``word_count`` divided by ``words_per_page`` (true division, so
    the result may be fractional). Tune ``words_per_page`` to the script
    formatting in use.
    """
    pages = word_count / words_per_page
    return pages
def screen_play_details(script_id):
    """
    Collect summary statistics from a script's CSV file.

    The CSV is located with ``get_file_path(script_id, "script-csv")`` and is
    expected to hold a header row followed by four-column rows:
    ``para_no, scene_no, content, script_element``.

    Changes compared with the previous implementation:
      * dialogues are tallied per speaker by looking up the speaker (the
        old code looked up the dialogue text, so counts never grew past 1
        and could raise KeyError);
      * a dialogue that appears before any speaker no longer raises
        IndexError - it is counted but not attributed to anyone;
      * a script without dialogues no longer raises ValueError - the
        busiest speaker is reported as "None" with a count of 0;
      * completely empty CSV rows are skipped;
      * words are split on any whitespace, so empty content counts as zero
        words instead of one.

    Args:
        script_id: id handed to ``get_file_path``
    Returns:
        dict: the statistics, see the ``details`` literal below for the keys
    """
    print("Entering the ScreenPlay Details extraction")
    file_name = get_file_path(script_id, "script-csv")

    # Plain per-element tallies, keyed by the script_element column value.
    element_totals = {
        "action": 0,
        "dialogue": 0,
        "slugline": 0,
        "parenthetical": 0,
        "transition": 0,
        "special_term": 0,
    }
    locations = set()
    unique_speakers = set()
    num_INT = 0
    num_EXT = 0
    longest_dialogue_len = 0
    longest_dialogue_scene = 0
    longest_dialogue_speaker = "None"
    dialogue_count = {}
    current_speaker = None  # most recent "speaker" row, if any
    total_word_count = 0

    # Placeholders: the database lookups that used to fill these in are
    # commented out in the original code.
    gpt_count = 0
    interval_at = 0
    lock_status = False
    no_of_pages = 0

    # newline="" is what the csv module documentation asks for.
    with open(file_name, "r", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row (tolerates an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            _para_no, scene_no, content, script_element = row

            if script_element in element_totals:
                element_totals[script_element] += 1

            if script_element == "dialogue":
                if current_speaker is not None:
                    dialogue_count[current_speaker] = (
                        dialogue_count.get(current_speaker, 0) + 1
                    )
                if len(content) > longest_dialogue_len:
                    longest_dialogue_scene = scene_no
                    longest_dialogue_speaker = (
                        current_speaker if current_speaker is not None else "None"
                    )
                    longest_dialogue_len = len(content)
            elif script_element == "slugline":
                if content.startswith("INT"):
                    num_INT += 1
                elif content.startswith("EXT"):
                    num_EXT += 1
                locations.add(content)
            elif script_element == "speaker":
                current_speaker = content
                unique_speakers.add(content)

            # Every row contributes to the word count, whatever its element.
            total_word_count += len(content.split())

    if dialogue_count:
        max_dialogue_speaker = max(dialogue_count, key=dialogue_count.get)
        max_dialogue_count = dialogue_count[max_dialogue_speaker]
    else:
        max_dialogue_speaker = "None"
        max_dialogue_count = 0

    details = {
        "interval_at": interval_at,
        "lock_status": lock_status,
        "no_of_pages": no_of_pages,
        "action": element_totals["action"],
        "dialogues": element_totals["dialogue"],
        "scenes": element_totals["slugline"],
        "num_of_speakers": len(unique_speakers),
        "special_term": element_totals["special_term"],
        "parenthetical": element_totals["parenthetical"],
        "transition": element_totals["transition"],
        "speakers": list(unique_speakers),
        "max_dialogue_speaker": max_dialogue_speaker,
        "max_dialogue_count": max_dialogue_count,
        "longest_dialogue_speaker": longest_dialogue_speaker,
        "longest_dialogue_len": longest_dialogue_len,
        "longest_dialogue_scene" : longest_dialogue_scene,
        "num_of_locations": len(locations),
        "locations": list(locations),
        "num_INT" : num_INT,
        "num_EXT" : num_EXT,
        "gpt_count": gpt_count,
        "total_word_count": total_word_count,
    }
    return details
def json_to_csv_scriptpad(json_obj: list) -> pd.DataFrame:
    """
    Flatten a flat list of editor paragraph nodes into a DataFrame.

    Paragraphs are numbered from 1 in list order. For every run of
    consecutive ``blank`` paragraphs only the first one is kept; the index
    labels of the dropped rows are left as gaps.

    Args:
        json_obj (list): paragraph nodes, each with ``attrs.scriptElement``
            and optionally a ``content`` list whose first item has ``text``
    Returns:
        pd.DataFrame: columns ``para_no``, ``content`` and ``script_element``
    """
    columns = ["para_no", "content", "script_element"]
    rows = []
    for para_no, paragraph in enumerate(json_obj, start=1):
        se = paragraph["attrs"]["scriptElement"]
        if se == "blank":
            cn = None
        elif "content" in paragraph:
            # Only the first text node of the paragraph is used.
            cn = paragraph["content"][0]["text"]
        else:
            cn = ""
        rows.append({"para_no": para_no, "content": cn, "script_element": se})
    # One construction instead of a pd.concat per paragraph.
    df = pd.DataFrame(rows, columns=columns)

    # Collapse runs of blank paragraphs down to their first element. The
    # element is read by column name; the positional ``row[-1]`` lookup on a
    # label-indexed Series is deprecated in pandas.
    repeated_blanks = []
    prv_el_bl = False
    for label, el in df["script_element"].items():
        is_blank = el == "blank"
        if is_blank and prv_el_bl:
            repeated_blanks.append(label)
        prv_el_bl = is_blank
    return df.drop(index=repeated_blanks)
# def Screen_Play_status_calculator(script_id):
# """
# This Function will extract the details in the CSV
# No of Pages
# No of Words
# No of Dialogues
# No of ActionLine
# No of Scene
# No of Speakers
# Speakers
# Chatgpt Use Count
# Longest Dialouge
# """
# print("Entering the ScreenPlay Details extraction")
# file_name = get_file_path(script_id, "script-csv")
# docx_file = get_file_path(script_id, "script-docx")
# doc = docx.Document(docx_file)
# # Initiali\ze counters for paragraphs and lines
# num_paragraphs = 0
# num_lines = 0
# # Iterate through paragraphs and count sd lines
# for paragraph in doc.paragraphs:
# num_paragraphs += 1
# num_lines += len(paragraph.text.split('\n'))
# words_per_page = 250
# pages = num_lines / words_per_page
# num_actions = 0
# num_dialogues = 0
# num_slugline = 0
# num_speakers = set()
# num_scene = 0
# with open(file_name, "r") as file:
# reader = csv.reader(file)
# header = next(reader)
# for row in reader:
# para_no, scene_no, content, script_element = row
# if script_element == "action":
# num_actions += 1
# if script_element == "dialogue":
# num_dialogues += 1
# if script_element == "slugline":
# num_slugline += 1
# if script_element == "speaker":
# num_speakers.add(content)
# # if scene_no != num_scene:
# # num_scene+=1
# num_speaker = len(num_speakers)
# details = {
# "action": num_actions,
# "dialogues":num_dialogues,
# "scenes": num_dialogues,
# "num_of_speaker": num_speaker,
# "speakers": list(num_speakers)
# }
# # print("Number of action lines:", num_actions)
# # print("Number of dialogues:", num_dialogues)
# # print("Number of scenes:", num_slugline)
# # print("Number of speaker:", len(num_speaker))
# return details