# Conversion_Kitchen_Code/kitchen_counter/utils/utilities.py

from io import IOBase, StringIO
import os,csv
import subprocess
from centralisedFileSystem.models import File, ScreenPlay, Script

from bs4 import BeautifulSoup as bfs
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Mm, Pt
import pandas as pd
# import pdftotext
from pdf2docx import parse
#from tika import parser

# import page_script.models as ps_models
from django.conf import settings
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.core.mail import EmailMultiAlternatives
from .filesystem import get_file_path

def fdx_to_csv(fdx_file : IOBase) -> pd.DataFrame:
    """
    Converts the fdx document to a Pandas DataFrame.
    Give FileIO as input, Not file path.

    Every ``Paragraph`` element that carries a ``Type`` attribute becomes one
    row. The stripped contents of its ``Text`` descendants are joined with a
    single space. Paragraphs without a ``Type`` attribute are skipped.

    Args:
        fdx_file (IOBase): fdx File object

    Returns:
        pd.DataFrame: one row per typed paragraph, with the columns
            ``Text`` and ``Script_Element``
    """

    soup = bfs(fdx_file.read(), 'xml')

    # Collect plain dicts and build the frame once; concatenating a one-row
    # frame per paragraph copies the whole frame on every iteration.
    rows = []
    for para in soup.find_all('Paragraph'):
        script_element = para.get('Type')
        if script_element is None:
            # Untyped paragraphs cannot be mapped to a script element.
            continue

        pieces = [text.get_text().strip() for text in para.find_all('Text')]
        rows.append({
            'Text': " ".join(pieces).strip(),
            'Script_Element': script_element,
        })

    return pd.DataFrame(rows, columns=['Text', 'Script_Element'])


def fdx_to_txt(fdx_file : IOBase) -> str:
    """
    Converts the fdx document to PlainText (string with indentations).
    Give FileIO as input, Not file path.

    Each row produced by ``fdx_to_csv`` is written with a left padding that
    depends on its script element. A newline is written only after a
    Dialogue, Action, Scene Heading or Transition row that is not the last
    row, and not after a Dialogue that is directly followed by another
    Dialogue or a Parenthetical.

    Args:
        fdx_file (IOBase): fdx File object

    Returns:
        str: string generated from given fdx
    """

    # Leading spaces per script element; any other element gets 15.
    left_pad = {
        'Character': 35,
        'Dialogue': 25,
        'Parenthetical': 30,
        'Transition': 55,
    }
    line_breaking = ('Dialogue', 'Action', 'Scene Heading', 'Transition')

    frame = fdx_to_csv(fdx_file)
    last_row = len(frame) - 1

    with StringIO() as buffer:

        for row_no, text, element in frame.itertuples():
            buffer.write(text.rjust(len(text) + left_pad.get(element, 15)))

            if row_no >= last_row or element not in line_breaking:
                continue

            # A dialogue that continues on the next row stays on the same line.
            if element == 'Dialogue' and frame['Script_Element'][row_no + 1] in ('Dialogue', 'Parenthetical'):
                continue

            buffer.write('\n')

        return buffer.getvalue()


def csv_to_docx(csv: pd.DataFrame) -> Document:
    """
    Renders a screenplay DataFrame as a python-docx document.

    One paragraph is added per row, in row order. The ``script_element``
    column selects the paragraph indents, ``content`` supplies the text.
    Sluglines and speakers are upper-cased, transitions are right-aligned,
    and a ``blank`` row produces an empty paragraph. Rows with any other
    script element keep the default indents.

    Args:
        csv (pd.DataFrame): frame with the columns ``script_element`` and
            ``content``

    Returns:
        Document: the generated document (Courier New 12pt, page
            8.57in x 297mm, 1.5in left margin)
    """

    # (left indent, right indent) in inches for each known script element.
    indents = {
        "slugline": (0, 0),
        "action": (0, 0),
        "dialogue": (1.0, 1.25),
        "parenthetical": (1.5, 2.25),
        "speaker": (2, 1),
        "transition": (2.5, 0),
        "special_term": (0, 0),
    }
    upper_cased = ("slugline", "speaker")

    output_doc = Document()
    normal_font = output_doc.styles["Normal"].font
    normal_font.name = "Courier New"
    normal_font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Inches(8.57)
    section.left_margin = Inches(1.5)

    for script_element, content in zip(csv["script_element"], csv["content"]):
        para = output_doc.add_paragraph()

        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        if script_element == "blank":
            # The empty paragraph that was just added is the blank line.
            continue

        if script_element in indents:
            left, right = indents[script_element]
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)

        if script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        # Missing cells (None / NaN) become an empty string; other non-string
        # cells, e.g. a purely numeric line, are written as their text form
        # so that assigning the paragraph text cannot fail on them.
        if not isinstance(content, str):
            content = "" if pd.isna(content) else str(content)

        if script_element in upper_cased:
            content = content.upper()

        para.text = content

    return output_doc


def _resolve_csv_path(f_path : str, f_name : str) -> str:
    """
    Returns the path of the CSV that belongs to ``f_name`` inside ``f_path``.

    The ``<stem>_audited.csv`` file is preferred; ``<stem>.csv`` is the
    fallback, where ``<stem>`` is ``f_name`` without its last extension.
    ``f_path`` is prepended as-is, so it presumably ends with a path
    separator - TODO confirm against ``get_file_path``.

    Raises:
        FileNotFoundError: neither file exists
    """

    stem = f_name.rsplit('.', 1)[0]

    # Both candidates are built from the stem, so a directory or file name
    # that itself contains "_audited.csv" is not rewritten by accident.
    audited = f"{f_path}{stem}_audited.csv"
    if os.path.exists(audited):
        return audited

    plain = f"{f_path}{stem}.csv"
    if os.path.exists(plain):
        return plain

    raise FileNotFoundError(f"Not found : {plain}")


def get_csv_from_id(script_id : str, encoding : str ="utf-8") -> pd.DataFrame:
    """
    Loads the CSV of a script, preferring the audited version.

    Args:
        script_id (str): id passed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): encoding tried first; on a ``UnicodeError`` the file
            is read again as ``utf-16``

    Returns:
        pd.DataFrame: content of the CSV

    Raises:
        FileNotFoundError: neither the audited nor the plain CSV exists
    """

    # NOTE(review): `ps_models` is not defined in this module - the
    # `import page_script.models as ps_models` line at the top of the file
    # is commented out - so this call raises NameError until it is restored.
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)

    path = _resolve_csv_path(f_path, f_name)

    try:
        return pd.read_csv(path, encoding=encoding)
    except UnicodeError:
        return pd.read_csv(path, encoding="utf-16")


def get_csv_path_from_id(script_id : str, encoding : str ="utf-8") -> str:
    """
    Returns the path of the CSV of a script, preferring the audited version.

    Args:
        script_id (str): id passed to ``MNFScriptDatabase_2.get_file_path``
        encoding (str): unused; kept so existing callers keep working

    Returns:
        str: path of the existing ``_audited.csv`` or ``.csv`` file

    Raises:
        FileNotFoundError: neither the audited nor the plain CSV exists
    """

    # NOTE(review): `ps_models` is not defined in this module - the
    # `import page_script.models as ps_models` line at the top of the file
    # is commented out - so this call raises NameError until it is restored.
    f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id,"csv")

    return _resolve_csv_path(f_path, f_name)


def json_to_csv(json_obj: dict) -> pd.DataFrame:
    """
    Flattens a scene-structured script JSON into a DataFrame.

    ``json_obj["content"]`` is read as a list of scenes and each scene's
    ``"content"`` as a list of paragraphs. Scenes without a ``"content"``
    key are skipped but still consume a scene number. Paragraphs are
    numbered consecutively across scenes, starting at 1.

    For every paragraph the script element is read from
    ``attrs.scriptElement``. The content is ``None`` for a ``blank``
    paragraph, the text of the first content node when the paragraph has
    content, and an empty string otherwise.

    In a run of consecutive ``blank`` rows only the first one is kept. The
    removed rows leave gaps in both the index and ``para_no``.

    Args:
        json_obj (dict): script document with a ``"content"`` list of scenes

    Returns:
        pd.DataFrame: columns ``para_no``, ``scene_no``, ``content`` and
            ``script_element``
    """

    columns = ["para_no", "scene_no", "content", "script_element"]
    rows = []

    for scene_no, scene in enumerate(json_obj["content"], start=1):

        if "content" not in scene:
            continue

        for paragraph in scene["content"]:

            se = paragraph["attrs"]["scriptElement"]

            if se == "blank":
                cn = None
            elif paragraph.get("content"):
                # NOTE(review): only the first content node is used; text
                # split over several nodes would be truncated - confirm
                # against the editor's JSON schema.
                cn = paragraph["content"][0]["text"]
            else:
                cn = ""

            rows.append({
                "para_no": len(rows) + 1,
                "scene_no": scene_no,
                "content": cn,
                "script_element": se,
            })

    # Build the frame once from all rows; assigning a fresh one-row frame
    # per paragraph would keep only the last paragraph.
    df = pd.DataFrame(rows, columns=columns)

    # A blank row directly preceded by another blank row is redundant.
    is_blank = df["script_element"] == "blank"
    repeated_blank = is_blank & is_blank.shift(1, fill_value=False)

    return df.drop(index=df.index[repeated_blank.to_numpy()])


def csv_to_json(csv: pd.DataFrame) -> str:
    """
    Serialises the DataFrame to a JSON string keyed by row index.

    Args:
        csv (pd.DataFrame): frame to serialise

    Returns:
        str: result of ``DataFrame.to_json(orient="index")``
    """
    return csv.to_json(orient="index")


def docx_to_pdf(doc_path : str, path : str) -> str:
    """
    Converts a document to PDF by invoking ``soffice``.

    The exit status of ``soffice`` is not checked, and the returned path is
    computed from the arguments, not verified on disk.

    Args:
        doc_path (str): path of the document to convert
        path (str): output directory handed to ``--outdir``

    Returns:
        str: ``<path>/<document name without its last extension>.pdf``
    """

    subprocess.call(
        [
            "soffice",
            # '--headless',
            "--convert-to",
            "pdf",
            "--outdir",
            path,
            doc_path,
        ]
    )

    # basename also copes with a doc_path that has no directory component.
    stem = os.path.basename(doc_path).rsplit('.', 1)[0]

    return path + '/' + stem + ".pdf"


def get_plain_text(script_id: str = None, path : str = None) -> str:
    """
    Returns the plain text of a script file, chosen by its file extension.

    When ``path`` is given it is used directly. Otherwise the file is
    looked up through ``MNFScriptDatabase_2.get_file_path(script_id)``.

    Args:
        script_id (str): id of the script; only used when ``path`` is empty
        path (str): path of the file to read

    Returns:
        str: for ``docx`` the paragraph texts joined by newlines; for
            ``pdf`` an empty string, as pdf extraction is disabled; for
            ``txt`` the file opened in binary mode (see the note in the
            body), which the caller has to close

    Raises:
        TypeError: the file extension is not txt, pdf or docx
    """

    if path:
        f_name = path
    else:
        # NOTE(review): `ps_models` is not defined in this module - the
        # `import page_script.models as ps_models` line at the top of the
        # file is commented out - so this branch raises NameError until it
        # is restored.
        f_path, f_name = ps_models.MNFScriptDatabase_2.get_file_path(script_id)
        path = f"{f_path}{f_name}"

    name = str(f_name)

    # Opened for every file type so that a missing file is reported first.
    input_file = open(path, "rb")

    if name.endswith("txt"):
        # NOTE(review): this hands out the open binary handle instead of a
        # str; kept as-is because callers may rely on it - confirm and
        # switch to returning the decoded text.
        return input_file

    # Only the txt branch passes the handle on; release it everywhere else.
    input_file.close()

    if name.endswith("pdf"):
        # pdftotext extraction is disabled (commented on 08-2-24).
        return ""

    if name.endswith("docx"):
        # The tika `parser` this branch relied on is no longer imported;
        # python-docx is already imported by this module and is used here.
        return "\n".join(
            paragraph.text for paragraph in Document(path).paragraphs
        )

    raise TypeError(
        f"conversion of {name.rsplit('.', maxsplit=1)[-1]} files not supported"
    )


def pdf_to_docx(input_script : str, output_converted_docx : str) -> None:
    """
    Converts a PDF to docx by delegating to pdf2docx ``parse``.

    Args:
        input_script (str): passed to ``parse`` as first argument (the PDF)
        output_converted_docx (str): passed to ``parse`` as second argument
            (the docx to write)
    """

    parse(
        input_script,
        output_converted_docx,
        start=0,
        end=None,
    )

def send_email_to_user(user,screenplay_name,subject,message):# removed flag = 1
    """
    Sends the ``audit/coree_email.html`` mail to ``user.email``.

    The template is rendered with ``Name`` (the user object), ``story_name``
    and ``message``. The mail carries the tag-stripped rendering as text
    body and the rendering itself as ``text/html`` alternative. A ``.`` is
    appended to ``subject``; the sender is ``settings.EMAIL_HOST_USER``.

    Args:
        user: object with an ``email`` attribute
        screenplay_name: value for the ``story_name`` template variable
        subject: mail subject without the trailing ``.``
        message: value for the ``message`` template variable
    """
    full_subject = subject + "."
    sender = settings.EMAIL_HOST_USER
    recipients = [user.email]

    html_body = render_to_string(
        "audit/coree_email.html",
        {"Name": user, "story_name": screenplay_name, "message": message},
    )
    plain_body = strip_tags(html_body)

    email = EmailMultiAlternatives(full_subject, plain_body, sender, recipients)
    email.attach_alternative(html_body, "text/html")
    email.send()


# def screen_play_details(script_id):

#     """
#     This Function will extract the details in the CSV
#     No of Dialogues
#     No of ActionLine
#     No of Scene
#     No of Speakers
#     Speakers

#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")

#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_locations= set()
#     num_speakers = set()
#     num_tranisitions = 0
#     num_parenthetical = 0
#     num_special_terms = 0
#     num_INT = 0
#     num_EXT = 0
#     longest_dialogue_len = 0
#     longest_dialogue_speaker = "None"
#     dialogue_count = {

#     }
#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)

#         for row in reader:
#             para_no, scene_no, content, script_element = row

#             if script_element == "action":
#                 num_actions += 1

#             if script_element == "dialogue":
#                 dialogue_count[content]+=1
#                 if longest_dialogue_len <len(content):
#                     longest_dialogue_speaker = content
#                 longest_dialogue_len = max(len(content),longest_dialogue_len)
#                 num_dialogues += 1

#             if script_element == "slugline":
#                 if content[0:3]=="INT":
#                     num_INT+=1
#                 if content[0:3]=="EXT":
#                     num_EXT+=1
#                 num_slugline += 1
#                 num_locations.add(content)

#             if script_element == "parenthetical":
#                 num_parenthetical += 1

#             if script_element == "transition":
#                 num_tranisitions += 1

#             if script_element == "special_term":
#                 num_special_terms += 1

#             if script_element == "speaker":
#                 num_speakers.add(content)

#     max_dialogue_speaker = max(dialogue_count, key=lambda word: dialogue_count[word])
#     max_dialouge_count = dialogue_count[max_dialogue_speaker]
#     num_speaker = len(num_speakers)
#     details = {
#         "action": num_actions,
#         "dialogues":num_dialogues,
#         "scenes": num_dialogues,
#         "num_of_speaker": num_speaker,
#         "special_term" : num_special_terms,
#         "parenthetical" : num_parenthetical,
#         "transition" : num_tranisitions,
#         "speakers": list(num_speakers),
#         "max_dialogue_speaker" : max_dialogue_speaker,
#         "max_dialouge_count" : max_dialouge_count,
#         "longest_dialogue_speaker" : longest_dialogue_speaker,
#         "longest_dialogue_len" : longest_dialogue_len,
#         "num_of_locations" : len(num_locations)

#     }
#     # print("Number of action lines:", num_actions)
#     # print("Number of dialogues:", num_dialogues)
#     # print("Number of scenes:", num_slugline)
#     # print("Number of speaker:", len(num_speaker))
#     return details

import csv

def estimate_page_count(word_count, words_per_page=100):
    """
    Estimates the number of pages for a given word count.

    Args:
        word_count: total number of words
        words_per_page: words assumed per page; adjust it to the script
            formatting in use

    Returns:
        ``word_count`` divided by ``words_per_page`` (true division, so the
        result is not rounded)
    """
    pages = word_count / words_per_page
    return pages

def screen_play_details(script_id):
    """
    Collects statistics from the ``script-csv`` file of a script.

    The file is located with ``get_file_path(script_id, "script-csv")``.
    Its first row is skipped as header. Every following non-empty row is
    unpacked as ``para_no, scene_no, content, script_element``.

    A dialogue is attributed to the most recent ``speaker`` row. Dialogues
    that appear before any speaker are counted in ``dialogues`` but not
    attributed to anyone.

    Args:
        script_id: id of the script whose CSV is analysed

    Returns:
        dict: per-element counts, the speakers and locations found, the
            speaker with the most dialogues, the longest dialogue (length in
            characters, speaker, scene) and the total word count. The
            speaker fields hold the string ``"None"`` when there is nothing
            to report. ``interval_at``, ``lock_status``, ``no_of_pages`` and
            ``gpt_count`` are fixed placeholder values.
    """
    print("Entering the ScreenPlay Details extraction")
    file_name = get_file_path(script_id, "script-csv")

    # Plain per-element tallies, keyed by the script_element value.
    counts = dict.fromkeys(
        (
            "action",
            "dialogue",
            "slugline",
            "parenthetical",
            "transition",
            "special_term",
        ),
        0,
    )
    locations = set()
    speaker_names = set()
    dialogue_count = {}  # speaker -> number of dialogues
    current_speaker = None
    num_INT = 0
    num_EXT = 0
    longest_dialogue_len = 0
    longest_dialogue_scene = 0
    longest_dialogue_speaker = "None"
    total_word_count = 0

    # Placeholders: the database lookups that used to fill these
    # (Script.interval_at / lock_status / no_of_pages and the BeatSheet
    # had_used_gpt count) are disabled.
    interval_at = 0
    lock_status = False
    no_of_pages = 0
    gpt_count = 0

    # newline="" lets the csv module handle line endings inside fields.
    with open(file_name, "r", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # header row; tolerate an empty file

        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue

            _para_no, scene_no, content, script_element = row

            if script_element in counts:
                counts[script_element] += 1

            if script_element == "dialogue":
                # The tally is keyed by speaker, not by the dialogue text.
                if current_speaker is not None:
                    dialogue_count[current_speaker] = (
                        dialogue_count.get(current_speaker, 0) + 1
                    )

                if len(content) > longest_dialogue_len:
                    longest_dialogue_len = len(content)
                    longest_dialogue_scene = scene_no
                    longest_dialogue_speaker = (
                        "None" if current_speaker is None else current_speaker
                    )

            elif script_element == "slugline":
                if content.startswith("INT"):
                    num_INT += 1
                elif content.startswith("EXT"):
                    num_EXT += 1
                locations.add(content)

            elif script_element == "speaker":
                current_speaker = content
                speaker_names.add(content)

            # split() without argument ignores repeated whitespace and
            # counts empty content as zero words.
            total_word_count += len(content.split())

    if dialogue_count:
        max_dialogue_speaker = max(dialogue_count, key=dialogue_count.get)
        max_dialogue_count = dialogue_count[max_dialogue_speaker]
    else:
        # No attributed dialogue at all: same "None" marker as used for
        # longest_dialogue_speaker.
        max_dialogue_speaker = "None"
        max_dialogue_count = 0

    details = {
        "interval_at": interval_at,
        "lock_status": lock_status,
        "no_of_pages": no_of_pages,
        "action": counts["action"],
        "dialogues": counts["dialogue"],
        "scenes": counts["slugline"],
        "num_of_speakers": len(speaker_names),
        "special_term": counts["special_term"],
        "parenthetical": counts["parenthetical"],
        "transition": counts["transition"],
        "speakers": list(speaker_names),
        "max_dialogue_speaker": max_dialogue_speaker,
        "max_dialogue_count": max_dialogue_count,
        "longest_dialogue_speaker": longest_dialogue_speaker,
        "longest_dialogue_len": longest_dialogue_len,
        "longest_dialogue_scene" : longest_dialogue_scene,
        "num_of_locations": len(locations),
        "locations": list(locations),
        "num_INT" : num_INT,
        "num_EXT" : num_EXT,
        "gpt_count": gpt_count,
        "total_word_count": total_word_count,
    }

    return details
def json_to_csv_scriptpad(json_obj: list) -> pd.DataFrame:
    """
    Flattens a flat list of script paragraphs into a DataFrame.

    Paragraphs are numbered from 1 in list order. For every paragraph the
    script element is read from ``attrs.scriptElement``. The content is
    ``None`` for a ``blank`` paragraph, the text of the first content node
    when the paragraph has content, and an empty string otherwise.

    In a run of consecutive ``blank`` rows only the first one is kept. The
    removed rows leave gaps in both the index and ``para_no``.

    Args:
        json_obj (list): paragraph nodes

    Returns:
        pd.DataFrame: columns ``para_no``, ``content`` and ``script_element``
    """

    columns = ["para_no", "content", "script_element"]
    rows = []

    for para_no, paragraph in enumerate(json_obj, start=1):

        se = paragraph["attrs"]["scriptElement"]

        if se == "blank":
            cn = None
        elif paragraph.get("content"):
            # NOTE(review): only the first content node is used; text split
            # over several nodes would be truncated - confirm against the
            # editor's JSON schema.
            cn = paragraph["content"][0]["text"]
        else:
            cn = ""

        rows.append({"para_no": para_no, "content": cn, "script_element": se})

    # One frame built from all rows instead of one concat per paragraph.
    df = pd.DataFrame(rows, columns=columns)

    # A blank row directly preceded by another blank row is redundant.
    is_blank = df["script_element"] == "blank"
    repeated_blank = is_blank & is_blank.shift(1, fill_value=False)

    return df.drop(index=df.index[repeated_blank.to_numpy()])

# def Screen_Play_status_calculator(script_id):

#     """
#     This Function will extract the details in the CSV
#     No of Pages
#     No of Words
#     No of Dialogues
#     No of ActionLine
#     No of Scene
#     No of Speakers
#     Speakers
#     Chatgpt Use Count
#     Longest Dialouge
#     """
#     print("Entering the ScreenPlay Details extraction")
#     file_name = get_file_path(script_id, "script-csv")
#     docx_file = get_file_path(script_id, "script-docx")
#     doc = docx.Document(docx_file)

#     # Initiali\ze counters for paragraphs and lines
#     num_paragraphs = 0
#     num_lines = 0

#     # Iterate through paragraphs and count sd lines
#     for paragraph in doc.paragraphs:
#         num_paragraphs += 1
#         num_lines += len(paragraph.text.split('\n'))
#     words_per_page = 250
#     pages = num_lines / words_per_page
#     num_actions = 0
#     num_dialogues = 0
#     num_slugline = 0
#     num_speakers = set()
#     num_scene = 0

#     with open(file_name, "r") as file:
#         reader = csv.reader(file)
#         header = next(reader)

#         for row in reader:
#             para_no, scene_no, content, script_element = row

#             if script_element == "action":
#                 num_actions += 1

#             if script_element == "dialogue":
#                 num_dialogues += 1

#             if script_element == "slugline":
#                 num_slugline += 1

#             if script_element == "speaker":
#                 num_speakers.add(content)

#             # if scene_no != num_scene:
#             #     num_scene+=1

#     num_speaker = len(num_speakers)
#     details = {
#         "action": num_actions,
#         "dialogues":num_dialogues,
#         "scenes": num_dialogues,
#         "num_of_speaker": num_speaker,
#         "speakers": list(num_speakers)
#     }
#     # print("Number of action lines:", num_actions)
#     # print("Number of dialogues:", num_dialogues)
#     # print("Number of scenes:", num_slugline)
#     # print("Number of speaker:", len(num_speaker))
#     return details