Conversion_Kitchen_Code/kitchen_counter/scriptAudit/sa_functions.py.save

14495 lines
534 KiB
Plaintext
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import pandas as pd
import math
import os
import csv
import subprocess
import io
import shutil
from centralisedFileSystem.models import File, Script
from pathlib import Path
import re
import textwrap
import docx
from docx import Document
from docx.shared import Pt, RGBColor
from docx.shared import Mm,Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT
from docx.enum.section import WD_ORIENT
from pdf2docx import parse
# import pdftotext
from scriptAudit.exceptions import ScriptAuditException
from utils import utilities
from datetime import date
from PyPDF2 import PdfFileReader, PdfFileWriter
from utils.scripts_functions import countPages
from conversion.translation.detection import script_det, language_detector
from conversion.translation.translation_variables import get_language_script_code, language_code
#mypath= str(Path(__file__).resolve().parent.parent) + "/neutralAudit/matrices/"
mypath= str(Path(__file__).resolve().parent) + "/matrices/"
# mypath = os.getcwd() +'\\'
def convert_to_pdf(input_docx, out_folder):
    """Convert ``input_docx`` to PDF using headless LibreOffice.

    The PDF is written into ``out_folder``. The call blocks until the
    ``libreoffice`` process has finished; its exit status is not checked.
    """
    command = [
        'libreoffice', '--headless', '--convert-to', 'pdf',
        '--outdir', out_folder, input_docx,
    ]
    converter = subprocess.Popen(command)
    converter.communicate()
def check_space_line(value):
    """Return "Y" if ``value`` is non-empty and all whitespace, otherwise "N"."""
    return "Y" if value.isspace() else "N"
def check_space(data):
    """Return the number of leading space characters in ``data``.

    Only the plain space character counts; a tab or any other character
    ends the run.
    """
    without_leading_spaces = data.lstrip(" ")
    return len(data) - len(without_leading_spaces)
def get_last_char_pos(data):
    """Return the index of the last non-whitespace character of ``data``.

    Returns ``None`` for an empty string and ``0`` for a string made only of
    whitespace (the first position already has nothing but whitespace after
    it). A single ``rstrip`` replaces re-slicing the tail at every position.
    """
    if not data:
        return None
    content_length = len(data.rstrip())
    if content_length == 0:
        # All whitespace: position 0 is the first whose tail is blank.
        return 0
    return content_length - 1
def get_case(value):
    """Classify the capitalisation pattern of a line of text.

    Returns one of:
      "None"       - empty or whitespace-only line
      "AllUpper"   - ``value.isupper()``
      "AllLower"   - ``value.islower()``
      "FirstCamel" - first word starts with a capital but is not all caps
      "FirstUpper" - first word is all caps and longer than one character
      "EndUpper"   - last word is all caps and longer than one character
      "MidUpper"   - some other word is all caps and longer than one character
      "Partial"    - none of the above

    Words are obtained by splitting the left-stripped line on single spaces.
    """
    # Same test as check_space_line(value) == "Y".
    if value.isspace():
        return "None"
    if value.isupper():
        return "AllUpper"
    if value.islower():
        return "AllLower"

    words = value.lstrip().split(" ")
    first_word = words[0]
    last_word = words[-1]
    if not first_word:
        # Only happens for an empty line; checked explicitly instead of
        # relying on an IndexError caught by a bare except.
        return "None"

    if first_word[0].isupper() and not first_word.isupper():
        return "FirstCamel"
    if first_word.isupper() and len(first_word) > 1:
        return "FirstUpper"
    if last_word.isupper() and len(last_word) > 1:
        return "EndUpper"
    if any(word.isupper() and len(word) > 1 for word in words):
        return "MidUpper"
    return "Partial"
def conv_pdf_to_docx(input_script, output_converted_docx):
    """Convert the PDF ``input_script`` to ``output_converted_docx``.

    Delegates to ``pdf2docx.parse`` starting at page index 0 with no end
    page given.
    """
    page_range = {'start': 0, 'end': None}
    parse(input_script, output_converted_docx, **page_range)
def conv_docx_to_txt(input_script,output_converted_txt):
# Render the .docx file `input_script` as plain text in `output_converted_txt`
# (written as UTF-8). Horizontal layout is approximated with leading spaces at
# 10 character columns per inch, derived from section margins and paragraph
# indents. Progress is reported through print() calls throughout.
# NOTE(review): this listing has lost its leading whitespace, so statement
# nesting is not visible here; any nesting mentioned in the comments below is
# inferred and must be confirmed against the upstream file.
# import textwrap
# from docx import Document
# from docx.shared import Pt
# from docx.shared import Mm
read_doc = Document(input_script)
# print(read_doc._body._body.xml)
#section= read_doc.sections[-1]
def recalculate_section_properties(n):
# Helper: return (section, canvas_width, left_margin) for section index n.
# canvas_width is the page width minus both side margins and left_margin is
# the left margin, each in tenths of an inch truncated to int. On any
# exception the fallback is section=None, canvas_width=65, left_margin=15.
try:
section = read_doc.sections[n]
section_width_inches = section.page_width.inches
#section_width = int(section.page_width.inches * 10)
# print("section width direct ",section.page_width.inches)
# print(section.left_margin.inches,section.right_margin.inches)
margins_inches = section.left_margin.inches + section.right_margin.inches
#margins = int((section.left_margin.inches + section.right_margin.inches)*10)
print(margins_inches)
canvas_width_inches = section_width_inches - margins_inches
canvas_width = int(canvas_width_inches *10)
print("canvas width",canvas_width)
left_margin = int(section.left_margin.inches * 10)
except:
section = None
canvas_width = 65
left_margin = 15
return section,canvas_width,left_margin
#for para in read_doc.paragraphs:
# n= 0
# p = para._p
# sectPrs = p.xpath("./w:pPr/w:sectPr")
# if sectPrs:
# n = n +1
# print("Section changed after para")
# print(para.text)
# section,canvas_width,left_margin = recalculate_section_properties(n)
# print(section.left_margin.inches)
# n is the index of the section currently in effect. The helper swallows its
# own errors, so the except below is reached when the fallback returned
# section=None and the attribute access inside print() fails; the retry then
# uses index -1.
n = 0
try:
section,canvas_width,left_margin = recalculate_section_properties(n)
print(section.left_margin.inches)
except:
n =-1
section,canvas_width,left_margin = recalculate_section_properties(n)
all_paras = read_doc.paragraphs
# NOTE(review): 'first' is not used again within this function.
first = all_paras[0].paragraph_format
#print(first.left_indent)
#count = 1
print("number of paras",len(all_paras))
#left_margin = 15
# previous_indent holds indent + left_margin of the last paragraph that
# reached the end of the loop body (see the final assignment below).
previous_indent= 0
with open(output_converted_txt, 'w', encoding='utf-8') as f:
for para in all_paras:
print('\n')
paragraph_format = para.paragraph_format
## using the paragraph spacing add blank line if required
# space_before / space_after fall back to 0.0 when the attribute access raises.
try:
space_before = paragraph_format.space_before.pt
except:
space_before = 0.0
try:
space_after = paragraph_format.space_after.pt
except:
space_after = 0.0
print("space before")
print(space_before)
print("space after")
print(space_after)
# Skip the paragraph when its line spacing is below 5pt and previous_indent
# exceeds 20. Any exception raised here is ignored (presumably the case where
# line_spacing has no .pt attribute — TODO confirm).
try:
print("line spacing ",paragraph_format.line_spacing.pt)
print("line spacing rule ",paragraph_format.line_spacing_rule)
if paragraph_format.line_spacing.pt < 5 and previous_indent > 20:
continue
#print("space before",paragraph_format.space_before.pt)
except:
pass
section_changed = False
try:
####check section end and remove if CONTINUED
# A w:sectPr element under the paragraph properties is treated as a section
# change. A paragraph whose text is a single space-separated token containing
# one of skip_words is dropped (continue) after switching to the next
# section's properties.
p = para._p
sectPrs = p.xpath("./w:pPr/w:sectPr")
if sectPrs:
section_changed = True
print("checking for continued at section change")
text = para.text.split(' ')
print(text)
if len(text) == 1:
skip_words = ['CONT','CONTD','CONTINUED',"CONT'D"]
## to be replaced by regex ,match
found_continue = False
for skip_word in skip_words:
if skip_word in text[0]:
#skip para
print("found continued")
found_continue = True
break
if found_continue:
print("skipping para but setting new section")
n= n+1
try:
section,canvas_width,left_margin = recalculate_section_properties(n)
print(section.left_margin.inches)
except Exception as e:
print(e)
continue
print("Continued not found at section change")
except:
pass
# Paragraph spacing above 5pt is rendered as one blank line before the text.
if float(space_before) > 5.0 :
print("adding blank line")
f.write('\n')
# fli / li / ri: first-line, left and right indent in inches. Each stays 0
# when its attribute access raises; fli is forced to 0 for paragraphs whose
# style name is 'List Paragraph'.
fli =0
li =0
ri =0
try:
if para.style.name == 'List Paragraph':
fli = 0
else:
fli = paragraph_format.first_line_indent.inches
except:
pass
try:
li = paragraph_format.left_indent.inches
except:
pass
try:
ri = paragraph_format.right_indent.inches
except:
pass
# Combined left indent in character columns (10 per inch).
indent = int((fli + li ) * 10)
print("calculated indent ",indent)
data = para.text
lines = data.split('\n')
print("Examining para")
try:
print(para.text)
print(para.style.name)
except:
pass
print("lines in para",len(lines))
## remove starting number (before margin) and number after 65 characters
# For a one-line paragraph: a negative indent is clamped to 0, trailing
# whitespace is stripped, and if the line is longer than 40 characters, the
# text from column 40 up to the last two characters is blank and those last
# two characters contain a digit, the last two characters are dropped.
if len(lines) == 1 :
if indent < 0:
print(indent)
#start = -(indent )
#if re.search('\d',lines[0][0:start]):
# lines[0] = " ".join(lines[0].split()[1:])
#else:
# lines[0] = lines[0][start:]
indent = 0
lines[0] = lines[0].rstrip()
if len(lines[0]) > 40:
if lines[0][40:-2].strip() == '' and re.search('\d',lines[0][-2:]) :
lines[0] = lines[0][0:-2]
print(indent)
for line in lines:
#line = line.rjust(len(line) + indent + left_margin)
line = line.replace('\t',' ')
# With a zero paragraph indent, the line's own leading spaces are used instead.
if indent == 0:
indent = check_space(line)
line = line.strip()
if line:
#print(line)
print(fli,li,indent,ri)
print(para.alignment)
# width: columns left for text after the indent and right indent; falls back
# to 58 - indent when the computation raises.
try:
width = int(canvas_width - (indent + ri*10))
except:
width = 58 - indent
# Alignment is detected by comparing str(para.alignment) with fixed strings.
# NOTE(review): this depends on the enum's string form being exactly
# 'CENTER (1)' / 'RIGHT (2)' / 'JUSTIFY (3)' — confirm for the python-docx
# version in use.
#if fli == 0 and li == 0 and str(para.alignment) == 'CENTER (1)':
if str(para.alignment) == 'CENTER (1)' :
ch_count = len(line)
print("line is center aligned")
print(ch_count)
indent = indent + int((width-ch_count)/2)
print(indent)
#elif fli == 0 and li == 0 and str(para.alignment) == 'RIGHT (2)':
elif str(para.alignment) == 'RIGHT (2)':
##removing fli li =
ch_count = len(line)
print("line is right aligned")
print(ch_count)
indent = indent + int(width-ch_count)
print(indent)
else:
# Justified text has its internal whitespace runs collapsed to single spaces.
if str(para.alignment) == 'JUSTIFY (3)':
line = ' '.join(line.split())
print("line is left aligned")
if indent+left_margin > 55:
indent = indent - 1
# textwrap.wrap needs a positive width.
if width <= 0:
width = 1
print("Calculated Width:",width)
wrapped_lines = textwrap.wrap(line, width)
wrapped_data_lines_count = len(wrapped_lines)
if wrapped_data_lines_count > 1:
print("need to wrap line")
for wrapped_line in wrapped_lines:
#print(wrapped_line)
# rjust to len + pad is the same as prefixing `indent + left_margin` spaces.
wrapped_line = wrapped_line.rjust(len(wrapped_line) + indent + left_margin)
print(indent+left_margin)
#print(wrapped_line)
f.write(wrapped_line)
f.write('\n')
continue
line = line.rjust(len(line) + indent + left_margin)
f.write(line)
f.write('\n')
else:
print("line is blank")
f.write(line)
f.write('\n')
####check section end
#p = para._p
#sectPrs = p.xpath("./w:pPr/w:sectPr")
# After a paragraph that ended a section, switch to the next section's
# page geometry.
if section_changed:
n = n +1
print("Section changed")
section,canvas_width,left_margin = recalculate_section_properties(n)
print(section.left_margin.inches)
# Paragraph spacing above 5pt is rendered as one blank line after the text.
if space_after >5.0:
print("adding blank line")
f.write('\n')
print("\n")
previous_indent = indent + left_margin
print("Converted to text")
def conv_pdf_to_txt(input_script, output_converted_txt):
    """Extract the text of the PDF ``input_script`` into a text file.

    The text of all pages is joined with a blank line between pages and
    written to ``output_converted_txt`` as UTF-8.

    The ``pdftotext`` call had been commented out, which left ``pdf``
    undefined and made every call fail with ``NameError``. The package is
    imported here, at call time, so that a missing installation surfaces as
    an ``ImportError`` from this function only.

    Raises:
        ImportError: if the optional ``pdftotext`` package is not installed.
    """
    import pdftotext  # optional dependency, needed only by this converter

    with open(input_script, "rb") as f:
        pdf = pdftotext.PDF(f)

    # One string per page; pages are separated by an empty line.
    txt_data = "\n\n".join(pdf)
    with open(output_converted_txt, "w", encoding="utf8") as out_file:
        out_file.write(txt_data)
def conv_pdf_to_txt_java(input_script, output_converted_txt):
    """Extract PDF text through the py4j Java gateway and save it as UTF-8.

    Calls ``entry_point.strip(input_script)`` on a ``JavaGateway`` and writes
    ``str(result['payload'])`` to ``output_converted_txt``. The output file is
    now closed deterministically via a context manager.
    """
    from py4j.java_gateway import JavaGateway

    gateway = JavaGateway()
    result = gateway.entry_point.strip(input_script)
    # result is a dict of {
    #   'success': 'true' or 'false',
    #   'payload': pdf file content if 'success' is 'true'
    #   'error': error message if 'success' is 'false'
    # }
    print(result['payload'])
    with open(output_converted_txt, "w", encoding="utf8") as out_file:
        out_file.write(str(result['payload']))
def conv_to_txt(input_script, output_converted_docx, output_converted_txt):
    """Convert a script file to plain text, dispatching on its extension.

    Args:
        input_script: path of the uploaded script (txt, pdf, docx or fdx).
        output_converted_docx: path for the intermediate .docx, used only
            for PDF input.
        output_converted_txt: path of the resulting text file.

    Raises:
        ScriptAuditException: if the extension is not supported.

    The extension is matched case-insensitively, so ``SCRIPT.PDF`` is handled
    like ``script.pdf``.
    """
    extention = input_script.rsplit(".", 1)[-1]
    kind = extention.lower()

    if kind == "txt":
        shutil.copyfile(input_script, output_converted_txt)
    elif kind == "pdf":
        # PDF goes through an intermediate .docx conversion.
        conv_pdf_to_docx(input_script, output_converted_docx)
        conv_docx_to_txt(output_converted_docx, output_converted_txt)
    elif kind == "docx":
        conv_docx_to_txt(input_script, output_converted_txt)
    elif kind == "fdx":
        # Close the source handle once the text has been extracted.
        with open(input_script, 'r') as fdx:
            plain_txt = utilities.fdx_to_txt(fdx)
        # Written as UTF-8 to match how the text file is read back later.
        with open(output_converted_txt, 'w', encoding='utf-8') as f:
            f.write(plain_txt)
    else:
        raise ScriptAuditException(f"{extention} file is not supported for Audit!")
def conv_to_df(txt_script):
    """Load a text script into a DataFrame with one row per line.

    The file is read as UTF-8 and split on ``"\\n"``, so a trailing newline
    yields a final empty row.

    Returns:
        DataFrame with columns ``line_no`` (1-based, stored as the string of
        a float, e.g. ``"1.0"``), ``data`` (the line text),
        ``Identification_Status`` (empty string) and ``isIdentified``
        (``"No"``), indexed 0..n-1.
    """
    with open(txt_script, 'r', encoding="utf-8") as src:
        lines = src.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']
    # Build all rows first: growing a DataFrame one df.loc row at a time
    # copies the frame on every insertion.
    rows = [
        [str(float(number)), text, '', 'No']
        for number, text in enumerate(lines, start=1)
    ]
    return pd.DataFrame(rows, columns=fields)
def conv_to_csv(txt_script, csv_for_processing):
    """Write a text script to a CSV file with one row per line.

    The script is read as UTF-8 and split on ``"\\n"``. The CSV gets the
    header ``line_no,data,Identification_Status,isIdentified`` followed by
    one row per line: the 1-based line number as the string of a float
    (``"1.0"``), the line text, an empty status and ``"No"``.

    The output is opened once and written as UTF-8 throughout, instead of
    being reopened in append mode for every row with a header in the
    platform encoding.
    """
    import csv

    with open(txt_script, 'r', encoding="utf-8") as src:
        lines = src.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']
    with open(csv_for_processing, 'w', encoding='utf-8', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        csvwriter.writerows(
            [str(float(number)), text, '', 'No']
            for number, text in enumerate(lines, start=1)
        )
def pre_assign_wts(df):
# Split lines of the form "SPEAKER (parenthetical): dialogue" into separate
# rows and tag them with pre-assigned weights, then renumber line_no.
# Returns the modified DataFrame.
# NOTE(review): this listing has lost its leading whitespace; the nesting
# described in the comments below is inferred and must be confirmed against
# the upstream file.
# NOTE(review): the 'preassigned_weights' column is written but not created
# here — presumably the caller adds it beforehand; verify.
skip_words = ['INT.','EXT.','I/E','E/I','CUT TO','CUT BACK TO','FLASHCUT TO','DISSOLVE TO', 'INTERCUT', 'INTER CUT','PBS', 'INTERVAL',
'FLASHBACK','FADE IN','FADE TO BLACK','ON THE SCREEN','ON THE TV','MORNING','AT HOTEL','TV','MONTAGES','MUSICAL MONTAGES','ESSENTIALS','LATER','ESSENTIAL']
# Candidate lines: stripped text contains ":-", ":", "-" or a double-quoted
# span, and contains none of skip_words.
# NOTE(review): skip_words are joined with '|' and passed to str.contains as
# a pattern, so characters such as '.' in 'INT.' are not matched literally
# if the default regex mode applies — confirm this is intended.
pos_sp_dial_line_nos = df.loc[(df['data'].str.strip().str.contains(r':-|:|-|".*"') == True) & (df['data'].str.strip().str.contains('|'.join(skip_words)) == False) ,'line_no'].to_list()
print(pos_sp_dial_line_nos)
# This is an alias of the same list, not a copy: the appends below also
# extend pos_sp_dial_line_nos.
new_pos_sp_dial_line_nos =pos_sp_dial_line_nos
for index in df.loc[df['line_no'].isin(pos_sp_dial_line_nos),:].index:
data = df['data'][index]
line_no = 0.0
new_line_no = 0.0
pos_sp_par = ''
line_no = df['line_no'][index]
# pos_sp: possible speaker, pos_par: possible parenthetical,
# pos_dia: possible dialogue, pos_sp_par: speaker plus parenthetical.
pos_sp =''
pos_par = ''
pos_dia = ''
pos_sp_par = ''
print(df.dtypes)
try:
print(data)
except:
pass
# Delimiters are tried in the order ":-", ":", "-", '"'. The text before the
# first occurrence becomes the speaker part; the text after the last
# occurrence (for quotes: the second-to-last piece) becomes the dialogue.
if ":-" in data:
pos_sp_par = data.split(":-")[0]
pos_dia = data.split(":-")[-1].strip()
elif ":" in data:
pos_sp_par = data.split(":")[0]
pos_dia = data.split(":")[-1].strip()
elif "-" in data:
pos_sp_par = data.split("-")[0]
pos_dia = data.split("-")[-1].strip()
elif "\"" in data:
pos_sp_par = data.split("\"")[0]
pos_dia = data.split("\"")[-2].strip()
pos_sp_par = pos_sp_par.strip()
if pos_sp_par:
#print(pos_sp_par)
# When both brackets are present, the speaker is the text before the first
# "(" and the parenthetical runs from the last "(" to the end.
if "(" in pos_sp_par and ")" in pos_sp_par:
pos_sp = pos_sp_par.split("(")[0]
pos_par = "(" + pos_sp_par.split("(")[-1]
else:
pos_sp = pos_sp_par
pos_par = ''
print(pos_sp)
print(pos_par)
print(pos_dia)
if pos_sp:
# A speaker must contain no digits, be upper case, and be followed by
# non-empty dialogue.
has_digit = any(chr.isdigit() for chr in pos_sp)
if not has_digit and pos_sp.isupper() and pos_dia.strip():
#if pos_dia.strip():
# The original row keeps only the speaker text.
df['data'][index] = pos_sp
df['preassigned_weights'][index] ='ps7-20'
# New rows are added under fractional index labels (index + 0.3 for the
# parenthetical, index + 0.6 for the dialogue) so that the sort_index call
# at the end places them directly after the speaker row.
if pos_par:
df.loc[index + 0.3] = np.nan
df.loc[index + 0.3,'data'] = pos_par
new_line_no = line_no + 0.3
df.loc[index + 0.3,'line_no'] = new_line_no
df.loc[index + 0.3,'isIdentified'] = 'No'
df.loc[index + 0.3,'preassigned_weights'] = 'ps10-20'
new_pos_sp_dial_line_nos.append(new_line_no)
print("split pos_par",df.loc[index + 0.3,'line_no'])
if pos_dia:
print("1",df.dtypes)
df.loc[index + 0.6] = np.nan
print("1.5",df.dtypes)
df.loc[index + 0.6,'data'] = pos_dia
new_line_no = line_no + 0.6
print(type(line_no),type(new_line_no))
df.loc[index + 0.6,'line_no'] = new_line_no
print("2",df.dtypes)
df.loc[index + 0.6,'isIdentified'] = 'No'
df.loc[index + 0.6,'preassigned_weights'] = 'ps13-20;ps14-20;ps15-20'
new_pos_sp_dial_line_nos.append(new_line_no)
print("split pos_dia",df.loc[index + 0.6,'line_no'],type(df.loc[index + 0.6,'line_no']))
print("3",df.dtypes)
# Put inserted rows in place, then renumber: index becomes 0..n-1 and
# line_no becomes 1.0..n as floats.
df = df.sort_index().reset_index(drop=True)
for index in df.index:
df['line_no'][index] = float(index + 1)
return df
def create_audit_df(df):
    """Create the audit-trail DataFrame for a script.

    Takes ``line_no`` and ``data`` from ``df`` and adds one column per audit
    attribute: four free-text columns initialised to ``''`` and nineteen flag
    columns initialised to ``'No'``. The result is indexed by ``line_no``.

    The two source columns are copied explicitly, so the new columns are
    added to an independent frame rather than to a slice of ``df``.
    """
    text_columns = (
        'Identification_Status',
        'data_corrected',
        'audited_line_no',
        'scene_number',
    )
    flag_columns = (
        'line_removed',
        'introduction',
        'appendix',
        'page_no',
        'left_indent_corrected',
        'right_indent_corrected',
        'line_wrapped_at_prescribed_right_indent',
        'case_corrected',
        'blank_inserted_before',
        'blank_inserted_after',
        'blank_deleted_before',
        'blank_deleted_after',
        'space_removed_between_characters',
        'space_added_between_characters',
        'line_merged_with_next_line',
        'line_broken_into_multiple_lines',
        'punctuation_mark_added',
        'punctuation_mark_removed',
        'language_specific_audit_comments',
    )

    audit_df = df[['line_no', 'data']].copy()
    for column in text_columns:
        audit_df[column] = ''
    for column in flag_columns:
        audit_df[column] = 'No'
    return audit_df.set_index('line_no')
def trim_intro(df, audit_df):
    """Remove the title/introduction that precedes the first "FADE IN".

    Scans ``df['data']`` (whitespace runs collapsed) for the first line
    containing "FADE IN", case-insensitively. If that line's index label is
    at most 100, every row from label 0 up to and including it is dropped
    from ``df`` in place and flagged in ``audit_df`` (which is indexed by
    ``line_no``) with ``line_removed = 'Yes'`` and ``introduction = 'Yes'``.

    ``df`` is expected to carry a contiguous integer index starting at 0.
    Returns ``None``; both frames are modified in place.
    """
    marker = re.compile('FADE IN', re.IGNORECASE)

    for index in df.index:
        normalised = ' '.join(df.at[index, 'data'].split())
        if not marker.search(normalised):
            continue
        print("Found Fade In", index)
        if index > 100:
            # Too far into the script to be an introduction; keep scanning.
            continue

        print("removing lines till ", index)
        intro_labels = list(range(index, -1, -1))
        for label in intro_labels:
            line_no = df.at[label, 'line_no']
            # Direct .loc writes: chained indexing (frame[col][row] = value)
            # is not guaranteed to update the frame itself.
            audit_df.loc[line_no, 'line_removed'] = 'Yes'
            audit_df.loc[line_no, 'introduction'] = 'Yes'
        df.drop(intro_labels, inplace=True)
        print("title and introduction removed")
        return
def remove_page_numbers(df, audit_df):
    """Flag page-number lines in ``audit_df``.

    A line counts as a page number when it starts with more than 54 space
    characters and its stripped text is non-empty and consists only of
    digits and dots. Such lines get ``line_removed = 'Yes'`` and
    ``page_no = 'Yes'`` in ``audit_df`` (indexed by ``line_no``). ``df`` is
    not modified.

    Each line is judged on its own; the result for one line no longer
    carries over to the lines that follow it.
    """
    page_no_pattern = re.compile(r'[\d.]+')

    for index in df.index:
        data = df.at[index, 'data']
        # Leading space count, the same measure check_space() produces.
        leading_spaces = len(data) - len(data.lstrip(' '))
        if leading_spaces <= 54:
            continue
        if not page_no_pattern.fullmatch(data.strip()):
            continue
        line_no = df.at[index, 'line_no']
        # Direct .loc writes: chained indexing is not guaranteed to update
        # the frame itself.
        audit_df.loc[line_no, 'line_removed'] = 'Yes'
        audit_df.loc[line_no, 'page_no'] = 'Yes'
def get_per_uppercase(text):
    """Return the percentage of upper-case characters in ``text``.

    The percentage is taken over all characters of the stripped text (not
    only letters) and truncated to an int. Empty or whitespace-only input
    gives 0.
    """
    stripped = text.strip()
    if not stripped:
        # Explicit guard instead of catching the division by zero.
        return 0
    count_upper = sum(1 for ch in stripped if ch.isupper())
    return int(count_upper / len(stripped) * 100)
def prep_for_audit(df):
# Compute the per-line features used by the audit and store them as columns
# of df; returns df. The index is reset to 0..n-1 first. Progress and
# intermediate values are reported through print() calls.
# NOTE(review): this listing has lost its leading whitespace; the nesting
# described in the comments below is inferred and must be confirmed against
# the upstream file.
df.reset_index(inplace=True, drop=True)
import re
print("Entering prep_for_audit")
df['data'].fillna('',inplace =True)
# Create any feature column that is missing ('' by default, 0 for 'lcp').
if 'scene_number' not in df.columns:
df['scene_number'] = ''
if 'Identification_Status' not in df.columns:
df['Identification_Status'] = ''
if 'plb' not in df.columns:
df['plb'] = ''
if 'nlb' not in df.columns:
df['nlb'] = ''
if 'ssc' not in df.columns:
df['ssc'] = ''
if 'lcp' not in df.columns:
df['lcp'] = 0
if 'case' not in df.columns:
df['case'] = ''
if 'per_uppercase' not in df.columns:
df['per_uppercase'] = ''
if 'parenthetical' not in df.columns:
df['parenthetical'] = ''
if 'pnbl_line_no' not in df.columns:
df['pnbl_line_no'] = ''
if 'nnbl_line_no' not in df.columns:
df['nnbl_line_no'] = ''
if 'ppnbl_line_no' not in df.columns:
df['ppnbl_line_no'] = ''
if 'nnnbl_line_no' not in df.columns:
df['nnnbl_line_no'] = ''
if 'pdil_line_no' not in df.columns:
df['pdil_line_no'] = ''
if 'ndil_line_no' not in df.columns:
df['ndil_line_no'] = ''
print("prep_for_audit- after if")
#print(str(df['line_no']))
print("593")
print(df)
# Pass 1: per-line features and the line numbers of the neighbouring lines.
for index in df.index:
#print(index)
data=df['data'][index]
#print(data)
# A line is 'blank' when it has at least 140 leading spaces, is all
# whitespace, or is empty.
if check_space(data) >= 140 or data.isspace() or (not data ):
df['Identification_Status'][index] = 'blank'
first_line = False
last_line = False
if index == 0 :
first_line = True
plb = "N"
else:
pvs_data = df['data'][index-1]
if index == df.index[-1]:
last_line = True
nlb = "N"
else:
next_data = df['data'][index+1]
print("616")
# plb / nlb: "Y" when the previous / next line is blank by the same test,
# "N" otherwise (and "N" at the first / last line).
if (not first_line):
if check_space(pvs_data) >= 140 or pvs_data.isspace() or (not pvs_data ):
plb = "Y"
else:
plb = "N"
#print(plb)
if (not last_line):
if check_space(next_data) >= 140 or next_data.isspace() or (not next_data ):
nlb = "Y"
else:
nlb = "N"
#print(nlb)
print("633")
# Stored below as ssc (leading space count), lcp (result of
# get_last_char_pos), case (result of get_case) and per_uppercase.
cur_indent = check_space(data)
lcp = get_last_char_pos(data)
case = get_case(data)
per_uppercase = get_per_uppercase(data)
# Bracket classification of the stripped line:
#   starts with "(": 'Complete' if it also ends with ")", 'PartStartMid' if
#     a ")" occurs elsewhere, else 'StartingLeft'
#   ends with ")":   'PartMidEnd' if a "(" occurs, else 'EndingRight'
#   otherwise:       'PartMidMid' (both), 'MixedLeft' (only "("),
#     'MixedRight' (only ")"), 'Absent' (neither)
par = ''
if re.match('\(',data.strip()[:1]):
if re.match('\)',data.strip()[-1:]) :
par = 'Complete'
elif re.search('\)',data.strip()) :
par = 'PartStartMid'
else:
par = 'StartingLeft'
elif re.match('\)',data.strip()[-1:]):
if re.search('\(',data.strip()):
par = 'PartMidEnd'
else:
par = 'EndingRight'
# beginning end already checked so now if paren present it is mixed
elif re.search('\(',data.strip()) and re.search('\)',data.strip()):
par = 'PartMidMid'
elif re.search('\(',data.strip()):
par = 'MixedLeft'
elif re.search('\)',data.strip()):
par = 'MixedRight'
else:
par = 'Absent'
print("660")
df['plb'][index] = plb
df['nlb'][index] = nlb
df['ssc'][index] = cur_indent
df['lcp'][index] = lcp
df['case'][index] = case
df['parenthetical'][index] = par
df['per_uppercase'][index] = per_uppercase
## pnlb ?
# pnbl_line_no: line_no of the previous line when it is not blank, otherwise
# the line_no two rows back; 0 when there is no such row.
# NOTE(review): the row two back is taken without checking whether it is
# blank itself — presumably consecutive blank lines are not expected here.
if first_line:
pnbl_line_no = 0
elif plb == 'N':
pnbl_line_no = df['line_no'][index -1]
elif index - 1 == 0:
pnbl_line_no = 0
else:
pnbl_line_no = df['line_no'][index -2]
print("678")
## nnlb ?
# nnbl_line_no: the forward counterpart, with 100000 as the "no such row"
# value.
if last_line:
nnbl_line_no = 100000
elif nlb == 'N':
nnbl_line_no = df['line_no'][index +1]
elif index + 1 == df.index[-1]:
nnbl_line_no = 100000
else:
try:
nnbl_line_no = df['line_no'][index +2]
except Exception as e:
# NOTE(review): this handler only prints diagnostics and looks the value up
# again with a float key; it does not assign nnbl_line_no, so the value from
# an earlier iteration (if any) is what gets stored below.
print("Exception--",e)
i = float(index) + 2
print("691",i,index)
print(str(df['line_no']))
print("692",df['line_no'][i])
print(nnbl_line_no)
print("694")
df['pnbl_line_no'][index] = pnbl_line_no
df['nnbl_line_no'][index] = nnbl_line_no
print("prep_for_audit- after 1st for loop")
# Pass 2: ppnbl_line_no / nnnbl_line_no are the pnbl_line_no / nnbl_line_no
# of the row that pnbl_line_no / nnbl_line_no points to; the values 0 and
# 100000 are passed through unchanged.
for index in df.index:
line_no = df['line_no'][index]
pnbl_line_no = df['pnbl_line_no'][index]
if pnbl_line_no == 0:
ppnbl_line_no = 0
else:
ppnbl_line_no = df.loc[df['line_no'] == pnbl_line_no, 'pnbl_line_no'].values[0]
nnbl_line_no = df['nnbl_line_no'][index]
print(index,line_no,pnbl_line_no,nnbl_line_no)
if nnbl_line_no == 100000:
nnnbl_line_no = 100000
else:
nnnbl_line_no = df.loc[df['line_no'] == nnbl_line_no, 'nnbl_line_no'].values[0]
df['ppnbl_line_no'][index] = ppnbl_line_no
df['nnnbl_line_no'][index] = nnnbl_line_no
print("prep_for_audit- after 2nd for loop")
# Pass 3: pdil_line_no is the line_no of the nearest earlier row that is not
# marked 'blank' and whose ssc differs from this row's; 0 when none exists.
for index in df.index:
data=df['data'][index]
pdil_line_no = 0
cur_indent = df['ssc'][index]
##pdil
## lets find previous different indent line
print(index,"looking for previous different indent line")
if index == 0:
df['pdil_line_no'][index] = pdil_line_no
continue
pdil_index = index - 1
while pdil_index >= 0 :
pdil_indent = df['ssc'][pdil_index]
print(cur_indent,pdil_indent)
if df['Identification_Status'][pdil_index] != 'blank' and pdil_indent != cur_indent:
pdil_line_no = df['line_no'][pdil_index]
break
else:
pdil_index -= 1
df['pdil_line_no'][index] = pdil_line_no
print("prep_for_audit- after 3rd for loop")
# Pass 4: ndil_line_no is the forward counterpart of pdil_line_no; 100000
# when no later row qualifies.
for index in df.index:
data=df['data'][index]
ndil_line_no = 100000
cur_indent = df['ssc'][index]
print("looking for next different indent line")
if index == df.index[-1]:
df['ndil_line_no'][index] = ndil_line_no
continue
ndil_index = index + 1
# ndil
while ndil_index <= df.index[-1]:
ndil_indent = df['ssc'][ndil_index]
print(cur_indent,ndil_indent)
if df['Identification_Status'][ndil_index] != 'blank' and ndil_indent != cur_indent:
ndil_line_no = df['line_no'][ndil_index]
break
else:
ndil_index += 1
df['ndil_line_no'][index] = ndil_line_no
return df
def remove_extra_blank_lines(df, audit_df):
    """Flag surplus blank lines so that runs of blanks collapse to one.

    For every row except the last (``df`` is expected to have a 0..n-1
    integer index):

    * a blank line followed by another blank line is flagged in ``audit_df``
      (indexed by ``line_no``) with ``line_removed = 'Yes'`` and
      ``Identification_Status = 'blank'``;
    * a blank line followed by a non-blank line gets ``plb = 'N'`` in ``df``.

    Does nothing for an empty frame. Returns ``None``.
    """
    if df.empty:
        return

    for index in range(0, df.index[-1]):
        if df.at[index, 'data'].strip():
            continue  # not a blank line
        if df.at[index + 1, 'data'].strip():
            df.at[index, 'plb'] = 'N'
        else:
            line_no = df.at[index, 'line_no']
            # Direct .loc writes: chained indexing is not guaranteed to
            # update the frame itself.
            audit_df.loc[line_no, 'line_removed'] = 'Yes'
            audit_df.loc[line_no, 'Identification_Status'] = 'blank'
def remove_blank_line_after_parenthetical(df, audit_df):
    """Flag a blank line that directly follows a closed parenthetical.

    For every row except the last (``df`` is expected to have a 0..n-1
    integer index): when the row's ``parenthetical`` value is ``'Complete'``
    or ``'EndingRight'`` and the next row is blank, the next row is flagged
    in ``audit_df`` (indexed by ``line_no``) with ``line_removed = 'Yes'``
    and the current row gets ``nlb = 'N'`` in ``df``.

    Does nothing for an empty frame. Returns ``None``.
    """
    if df.empty:
        return

    closing_states = ('Complete', 'EndingRight')
    for index in range(0, df.index[-1]):
        if df.at[index, 'parenthetical'] not in closing_states:
            continue
        if df.at[index + 1, 'data'].strip():
            continue  # next line has content
        nl_line_no = df.at[index + 1, 'line_no']
        # Direct .loc write: chained indexing is not guaranteed to update
        # the frame itself.
        audit_df.loc[nl_line_no, 'line_removed'] = 'Yes'
        df.at[index, 'nlb'] = 'N'
def merge_broken_lines(df,audit_df):
# Intended to join a line with the next non-blank line when the next line
# starts 1 to 3 columns after the current line's last character, recording
# the merge in audit_df.
# NOTE(review): cur_case is hard-coded to 'AllUpper' below, and the merge
# condition requires cur_case != 'AllUpper', so as written the merge branch
# can never run and the function only prints. Confirm whether this is a
# deliberate switch-off or a leftover.
# NOTE(review): this listing has lost its leading whitespace; the nesting
# described in the comments below is inferred and must be confirmed against
# the upstream file.
# An explicit iterator is used so that next(index_iter) can skip rows that
# were consumed by a merge.
index_iter = iter(range(0,df.index[-1]))
for index in index_iter:
cur_line_data = df['data'][index]
cur_line_indent = df['ssc'][index]
cur_case = 'AllUpper'
cur_lcp = df['lcp'][index]
nnbl_line_no = df['nnbl_line_no'][index]
nlb = df['nlb'][index]
# if nlb == 'Y':
# next_nbl_index = index +2
# if next_nbl_index > df.index[-1]:
# continue
# else:
# next_nbl_index = index +1
# Look up the row whose line_no equals this row's nnbl_line_no.
# NOTE(review): the fallback below does not set next_nbl_case.
try:
next_nbl_data = df.loc[df['line_no'] == nnbl_line_no , 'data'].values[0]
next_nbl_indent = df.loc[df['line_no'] == nnbl_line_no , 'ssc'].values[0]
next_nbl_case = df.loc[df['line_no'] == nnbl_line_no , 'case'].values[0]
except:
next_nbl_data = ''
next_nbl_indent = 0
line_no = df['line_no'][index]
#next_nbl_line_no = df['line_no'][next_nbl_index]
two_line_data = ''
# Gap between where this line's text ends and where the next line's begins.
indent_dif = next_nbl_indent - cur_lcp
print(line_no,indent_dif)
if indent_dif > 0 and indent_dif <= 3 and next_nbl_case != 'AllUpper' and cur_case != 'AllUpper':
# A gap of exactly 1 joins the two texts directly; 2 or 3 inserts one space.
if indent_dif == 1:
two_line_data = cur_line_data.rstrip() + next_nbl_data.lstrip()
else:
two_line_data = cur_line_data.rstrip() + ' ' + next_nbl_data.lstrip()
two_line_len = len(two_line_data.strip())
print(index,line_no,cur_line_indent,next_nbl_indent,two_line_len)
print(cur_line_data)
print(next_nbl_data)
# Only merge when the combined text stays under 150 characters.
if two_line_len < 150:
print("merging lines")
df['data'][index] = two_line_data
case = get_case(two_line_data)
df['case'][index] = case
# lcp = get_last_char_pos(two_line_data)
# df['last_character_placement'][index] = lcp
print(line_no)
audit_df['line_merged_with_next_line'][line_no] = 'Yes'
print(two_line_data)
audit_df['line_removed'][nnbl_line_no] = 'Yes'
# try:
# df['nlb'][next_nbl_index-1] = df['nlb'][next_nbl_index]
# except:
# pass
# try:
# df['plb'][next_nbl_index+1] = df['plb'][next_nbl_index]
# except:
# pass
# Skip the merged-in row (and the blank row before it when nlb is not 'N').
# NOTE(review): next() is called without a default, so it raises
# StopIteration if the iterator is already exhausted.
if nlb == 'N':
next(index_iter)
else:
next(index_iter)
next(index_iter)
else:
print(cur_line_data)
else:
print(index,cur_line_indent,next_nbl_indent)
try:
print(cur_line_data)
except:
pass
#newfile.write(cur_line_data)
def remove_space_between_words(df, audit_df):
    """Collapse runs of whitespace between words to a single space.

    Rows whose ``line_no`` is already flagged ``line_removed == 'Yes'`` in
    ``audit_df``, or whose ``ssc`` exceeds 140, are skipped. For every other
    row the text is rebuilt as ``ssc`` leading spaces followed by each word
    and one space, ``lcp`` is recomputed with ``get_last_char_pos``, and the
    row is flagged in ``audit_df`` with
    ``space_removed_between_characters = 'Yes'`` when the stripped text
    changed. Both frames are modified in place; returns ``None``.
    """
    # Set membership instead of scanning a list for every row.
    removed_line_nos = set(audit_df.index[audit_df['line_removed'] == 'Yes'])

    for index in df.index:
        cur_indent = df.at[index, 'ssc']
        line_no = df.at[index, 'line_no']
        if line_no in removed_line_nos or cur_indent > 140:
            continue

        data = df.at[index, 'data']
        # Every word keeps a trailing space, including the last one.
        new_data = ''.join(word + ' ' for word in data.split())
        new_data = new_data.rjust(len(new_data) + cur_indent)

        # Direct .at / .loc writes: chained indexing is not guaranteed to
        # update the frame itself.
        df.at[index, 'data'] = new_data
        df.at[index, 'lcp'] = get_last_char_pos(new_data)
        if new_data.strip() != data.strip():
            audit_df.loc[line_no, 'space_removed_between_characters'] = 'Yes'
#df = df.loc[df['line_removed'] != 'Yes',:]
def get_strict_conditions(csv_strict_conditions):
    """Load the strict-condition matrix from a CSV file.

    The first row of the file is skipped, the next row is the header, and
    the first column becomes the index. Only the first 30 data rows are
    kept. Selected columns are renamed by position and returned in a fixed
    order; all other columns are dropped.
    """
    import pandas as pd

    raw = pd.read_csv(csv_strict_conditions, index_col=[0], skiprows=[0])
    conditions_df = raw.head(30)

    # Position of each wanted column (index column excluded) -> new name.
    position_to_name = {
        3: 'cl_plb',
        4: 'cl_nlb',
        5: 'cl_ssc',
        6: 'cl_lcp',
        7: 'cl_par',
        8: 'cl_case',
        9: 'cl_per_uppercase',
        10: 'pnbl_plb',
        11: 'pnbl_par',
        12: 'pnbl_vs_cur_indent',
        15: 'pnbl_case',
        16: 'nnbl_nlb',
        17: 'nnbl_par',
        18: 'nnbl_vs_cur_indent',
        21: 'nnbl_case',
        22: 'pdil_plb',
        23: 'pdil_nlb',
        24: 'pdil_vs_cur_indent',
        27: 'pdil_par',
        29: 'ndil_plb',
        30: 'ndil_nlb',
        31: 'ndil_vs_cur_indent',
        34: 'ndil_par',
    }
    header = conditions_df.columns
    renames = {header[pos]: name for pos, name in position_to_name.items()}
    conditions_df = conditions_df.rename(columns=renames)

    wanted_order = [
        'cl_plb', 'cl_nlb', 'cl_ssc', 'cl_lcp', 'cl_par', 'cl_case', 'cl_per_uppercase',
        'pnbl_plb', 'pnbl_par', 'pnbl_vs_cur_indent', 'pnbl_case',
        'nnbl_nlb', 'nnbl_par', 'nnbl_vs_cur_indent', 'nnbl_case',
        'pdil_plb', 'pdil_nlb', 'pdil_par', 'pdil_vs_cur_indent',
        'ndil_plb', 'ndil_nlb', 'ndil_par', 'ndil_vs_cur_indent',
    ]
    return conditions_df[wanted_order]
def test_strict_conditions(df,csv_strict_conditions):
import pandas as pd
left_aligned = True
for index in df.index:
if df['ssc'][index] > 15:
left_aligned = False
break
## if conversion to text is left aligned then dialogue middle wrongly getting identified as action middle so skipping strict contions
if left_aligned:
df.loc[df['Identification_Status'] == 'blank','isIdentified'] = 'Yes'
return
conditions_df = get_strict_conditions(csv_strict_conditions)
#df['isIdentified'] = 'No'
df['When_Identified'] = ''
bb = False
for index in df.index:
if df['isIdentified'][index] == 'Yes':
continue
cl_plb = df['plb'][index]
cl_nlb = df['nlb'][index]
cl_indent = pd.to_numeric(df['ssc'][index])
cl_lcp = df['lcp'][index]
cl_par = df['parenthetical'][index]
cl_case = str(df['case'][index])
cl_per_uppercase = df['per_uppercase'][index]
pnbl_plb = None
pnbl_indent = None
pnbl_par = None
pnbl_case = None
nnbl_nlb = None
nnbl_indent = None
nnbl_par = None
nnbl_case = None
pnbl = True
nnbl = True
pdil = True
ndil = True
pdil_plb = None
pdil_nlb = None
pdil_par = None
pdil_indent = None
ndil_plb = None
ndil_nlb = None
ndil_par = None
ndil_indent = None
pnbl_vs_cur_indent = "NA"
nnbl_vs_cur_indent = "NA"
pdil_vs_cur_indent = "NA"
ndil_vs_cur_indent = "NA"
try:
pnbl_line_no = df['pnbl_line_no'][index]
pnbl_index = df.loc[df['line_no'] == pnbl_line_no, :].index.values[0]
except:
pnbl = False
if pnbl:
pnbl_plb = df['plb'][pnbl_index]
pnbl_indent = df['ssc'][pnbl_index]
pnbl_par = df['parenthetical'][pnbl_index]
pnbl_case = str(df['case'][pnbl_index])
if pnbl_indent > cl_indent:
pnbl_vs_cur_indent = "More"
elif pnbl_indent == cl_indent:
pnbl_vs_cur_indent = "Same"
else:
pnbl_vs_cur_indent = "Less"
#print(pnbl_index)
try:
nnbl_line_no = df['nnbl_line_no'][index]
nnbl_index = df.loc[df['line_no'] == nnbl_line_no, :].index.values[0]
nnbl_nlb = df['nlb'][nnbl_index]
nnbl_indent = df['ssc'][nnbl_index]
nnbl_par = df['parenthetical'][nnbl_index]
nnbl_case = str(df['case'][nnbl_index])
if nnbl_indent > cl_indent:
nnbl_vs_cur_indent = "More"
elif nnbl_indent == cl_indent:
nnbl_vs_cur_indent = "Same"
else:
nnbl_vs_cur_indent = "Less"
except:
nnbl = 'afterlast'
try:
pdil_line_no = df['pdil_line_no'][index]
pdil_index = df.loc[df['line_no'] == pdil_line_no, :].index.values[0]
except:
pdil = False
if pdil:
pdil_plb = df['plb'][pdil_index]
pdil_nlb = df['nlb'][pdil_index]
pdil_par = df['parenthetical'][pdil_index]
pdil_indent = df['ssc'][pdil_index]
if pdil_indent > cl_indent:
pdil_vs_cur_indent = "More"
elif pdil_indent == cl_indent:
pdil_vs_cur_indent = "Same"
else:
pdil_vs_cur_indent = "Less"
try:
ndil_line_no = df['ndil_line_no'][index]
ndil_index = df.loc[df['line_no'] == ndil_line_no, :].index.values[0]
except:
ndil = False
if ndil:
ndil_plb = df['plb'][ndil_index]
ndil_nlb = df['nlb'][ndil_index]
ndil_par = df['parenthetical'][ndil_index]
ndil_indent = df['ssc'][ndil_index]
if ndil_indent > cl_indent:
ndil_vs_cur_indent = "More"
elif ndil_indent == cl_indent:
ndil_vs_cur_indent = "Same"
else:
ndil_vs_cur_indent = "Less"
cl_pos = ''
## get the conditions
#for j in range(1,32):
for j in range(1,18):
if j in [23,24,32,33]:
continue
ev_cl_plb = conditions_df['cl_plb' ]["ps{0}".format(j)]
ev_cl_nlb = conditions_df['cl_nlb' ]["ps{0}".format(j)]
ev_cl_indent_range = conditions_df['cl_ssc' ]["ps{0}".format(j)].split('-')
try:
ev_cl_indent_from = pd.to_numeric(ev_cl_indent_range[0])
except:
ev_cl_indent_from = 200
try:
ev_cl_indent_to = pd.to_numeric(ev_cl_indent_range[1])
except:
ev_cl_indent_to = ev_cl_indent_from
ev_cl_lcp_range = conditions_df['cl_lcp' ]["ps{0}".format(j)].split('-')
try:
ev_cl_lcp_from = pd.to_numeric(ev_cl_lcp_range[0])
except:
ev_cl_lcp_from = 200
try:
ev_cl_lcp_to = pd.to_numeric(ev_cl_lcp_range[1])
except:
ev_cl_lcp_to = ev_cl_lcp_from
ev_cl_par = conditions_df['cl_par' ]["ps{0}".format(j)].split(";")
ev_cl_case = []
ev_cl_case = conditions_df['cl_case' ]["ps{0}".format(j)].split(";")
ev_cl_per_uppercase = conditions_df['cl_per_uppercase']["ps{0}".format(j)].split(";")
try:
operator = ev_cl_per_uppercase[0]
value = int(ev_cl_per_uppercase[1])
except:
operator = ""
value = ""
ev_pnbl_plb = conditions_df['pnbl_plb' ]["ps{0}".format(j)]
ev_pnbl_vs_cur_indent = conditions_df['pnbl_vs_cur_indent' ]["ps{0}".format(j)].split(";")
ev_pnbl_par = conditions_df['pnbl_par' ]["ps{0}".format(j)].split(";")
ev_pnbl_case = conditions_df['pnbl_case' ]["ps{0}".format(j)].split(";")
ev_nnbl_nlb = conditions_df['nnbl_nlb' ]["ps{0}".format(j)]
ev_nnbl_vs_cur_indent = conditions_df['nnbl_vs_cur_indent' ]["ps{0}".format(j)].split(";")
ev_nnbl_par = conditions_df['nnbl_par' ]["ps{0}".format(j)].split(";")
ev_nnbl_case = conditions_df['nnbl_case' ]["ps{0}".format(j)].split(";")
ev_pdil_plb = conditions_df['pdil_plb' ]["ps{0}".format(j)]
ev_pdil_nlb = conditions_df['pdil_nlb' ]["ps{0}".format(j)]
ev_pdil_vs_cur_indent = conditions_df['pdil_vs_cur_indent' ]["ps{0}".format(j)].split(";")
ev_pdil_par = conditions_df['pdil_par' ]["ps{0}".format(j)].split(";")
ev_ndil_plb = conditions_df['ndil_plb' ]["ps{0}".format(j)]
ev_ndil_nlb = conditions_df['ndil_nlb' ]["ps{0}".format(j)]
ev_ndil_vs_cur_indent = conditions_df['ndil_vs_cur_indent' ]["ps{0}".format(j)].split(";")
ev_ndil_par = conditions_df['ndil_par' ]["ps{0}".format(j)].split(";")
## checks
if not pnbl:
pnbl_plb_check = True
pnbl_indent_check = True
pnbl_par_check = True
pnbl_case_check = True
else:
pnbl_plb_check = True if (pnbl_plb == ev_pnbl_plb) or (ev_pnbl_plb == 'Maybe') else False
pnbl_indent_check = True if (pnbl_vs_cur_indent in ev_pnbl_vs_cur_indent ) else False
pnbl_par_check = True if pnbl_par in ev_pnbl_par else False
if pnbl_case in ev_pnbl_case or ev_pnbl_case == '' :
pnbl_case_check = True
else:
pnbl_case_check = False
if not nnbl :
nnbl_nlb_check = True
nnbl_indent_check = True
nnbl_par_check = True
nnbl_case_check = True
else:
nnbl_nlb_check = True if (nnbl_nlb == ev_nnbl_nlb) or (ev_nnbl_nlb == 'Maybe') else False
nnbl_indent_check = True if (nnbl_vs_cur_indent in ev_nnbl_vs_cur_indent ) else False
nnbl_par_check = True if nnbl_par in ev_nnbl_par else False
if nnbl_case in ev_nnbl_case or ev_nnbl_case == '' :
nnbl_case_check = True
else:
nnbl_case_check = False
if not pdil:
pdil_plb_check = True
pdil_nlb_check = True
pdil_indent_check = True
pdil_par_check = True
else:
pdil_plb_check = True if (pdil_plb == ev_pdil_plb) or (ev_pdil_plb == 'Maybe') else False
pdil_nlb_check = True if (pdil_nlb == ev_pdil_nlb) or (ev_pdil_nlb == 'Maybe') else False
pdil_indent_check = True if (pdil_vs_cur_indent in ev_pdil_vs_cur_indent ) else False
pdil_par_check = True if pdil_par in ev_pdil_par else False
if not ndil:
ndil_plb_check = True
ndil_nlb_check = True
ndil_indent_check = True
ndil_par_check = True
else:
ndil_plb_check = True if (ndil_plb == ev_ndil_plb) or (ev_ndil_plb == 'Maybe') else False
ndil_nlb_check = True if (ndil_nlb == ev_ndil_nlb) or (ev_ndil_nlb == 'Maybe') else False
ndil_indent_check = True if (ndil_vs_cur_indent in ev_ndil_vs_cur_indent ) else False
ndil_par_check = True if ndil_par in ev_ndil_par else False
cl_indent_check = False
cl_lcp_check = False
## check conditions
cl_plb_check = True if (cl_plb == ev_cl_plb) or (ev_cl_plb == 'Maybe') else False
cl_nlb_check = True if (cl_nlb == ev_cl_nlb) or (ev_cl_nlb == 'Maybe') else False
cl_indent_check = True if (cl_indent >= ev_cl_indent_from) and (cl_indent <= ev_cl_indent_to) else False
cl_lcp_check = True if (cl_lcp >= ev_cl_lcp_from) and (cl_lcp <= ev_cl_lcp_to) else False
cl_par_check = True if cl_par in ev_cl_par else False
if j == 21 and cl_case in ev_cl_case:
data = df['data'][index]
if data.split()[-1] == data.split()[-1].upper():
cl_case_check = True
else:
cl_case_check = False
else:
cl_case_check = True if cl_case in ev_cl_case else False
cl_per_uppercase_check = True
## cl percentage upper checks
if operator == "lessthan":
cl_per_uppercase_check = True if cl_per_uppercase <= value else False
elif operator == "morethan":
cl_per_uppercase_check = True if cl_per_uppercase >= value else False
elif operator == "equal":
cl_per_upperacse_check = True if cl_per_uppercase == value else False
#cl_par_check = True
checklist = [cl_plb_check,cl_nlb_check,cl_indent_check,cl_lcp_check,cl_par_check,cl_case_check,cl_per_uppercase_check,
pnbl_plb_check,pnbl_indent_check,pnbl_par_check,pnbl_case_check,
nnbl_nlb_check,nnbl_indent_check,nnbl_par_check,nnbl_case_check,
pdil_plb_check,pdil_nlb_check,pdil_indent_check,pdil_par_check,
ndil_plb_check,ndil_nlb_check,ndil_indent_check,ndil_par_check]
if all(checklist):
cl_pos = "ps{0}".format(j)
df['Identification_Status'][index] = cl_pos
df['isIdentified'][index] = 'Yes'
df['When_Identified'][index] = 'FirstStrictConditions'
break
df.loc[df['Identification_Status'] == 'blank','isIdentified'] = 'Yes'
def _interpolate_anchor_columns(wts_df, prefix, anchors):
    """Fill the gaps between anchor weight columns by linear interpolation.

    ``anchors`` are the integer positions for which ``wts_df`` already holds a
    ``<prefix>_<position>`` column.  A column is produced for every integer
    position between the smallest and largest anchor, and the result is merged
    back onto ``wts_df`` on ``'Possibilities'``.  Columns that already exist in
    ``wts_df`` keep their name; the copies coming from the interpolation get a
    ``_y`` suffix.
    """
    anchor_cols = ['{}_{}'.format(prefix, pos) for pos in anchors]
    gap_cols = ['{}_{}'.format(prefix, pos)
                for pos in range(min(anchors) + 1, max(anchors))
                if pos not in anchors]
    sub = wts_df.loc[:, ['Possibilities'] + anchor_cols].set_index('Possibilities')
    # reindex() creates the gap columns as NaN; sorting the names puts every
    # column in positional order so interpolation runs left to right.
    sub = sub.reindex(columns=sorted(anchor_cols + gap_cols))
    sub = sub.interpolate(axis=1).round().reset_index()
    return wts_df.merge(sub, how='inner', on=['Possibilities'], suffixes=('', '_y'))


def prep_weights_csv(weights_csv):
    """Load the weights sheet and expand it to one weight column per position.

    The first line of the file is skipped and only the first 50 rows are kept.
    A fixed set of columns is renamed by position, the frame is trimmed to the
    columns used for weighting, and then:

    * ``ssc_16`` .. ``ssc_64`` are linearly interpolated (and rounded) from the
      anchor columns ``ssc_15/25/30/35/55/65``;
    * ``lcp_30`` and ``lcp_75`` are set to 1 and ``lcp_31`` .. ``lcp_74`` are
      interpolated from ``lcp_30/35/49/59/72/75``.

    Args:
        weights_csv: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame indexed by ``'Possibilities'``.
    """
    wts_df = pd.read_csv(weights_csv, skiprows=[0])
    wts_df = wts_df.head(50)

    # These columns are identified by their position in the sheet, not by the
    # header text, so they are renamed positionally.
    names_by_position = {
        1: 'Possibilities', 2: 'Description', 3: 'PureImpure',
        7: 'AllUpper', 8: 'AllLower', 9: 'FirstCamel', 10: 'FirstUpper',
        11: 'FirstLowerSomeUpper', 12: 'Partial', 13: 'EntireLine',
        14: 'PartofLine', 15: 'only left parenthetical present',
        16: 'only right parenthetical present',
        17: 'PLB_Yes', 18: 'PLB_No', 19: 'NLB_Yes', 20: 'NLB_No',
        21: '<15withNumeric',
        29: '<15withoutNumeric',
        34: 'cur_indent_equals_pnbl', 35: 'cur_indent_equals_nnbl',
        36: 'containsSpecialWords1', 37: 'containsSpecialWords2',
        38: 'containsSpecialWords3', 39: 'containsSpecialWords4',
    }
    wts_df.rename(
        columns={wts_df.columns[pos]: name for pos, name in names_by_position.items()},
        inplace=True,
    )

    wts_df = wts_df.loc[:, [
        'Possibilities', 'Description', 'PureImpure',
        'AllUpper', 'AllLower', 'FirstCamel', 'FirstUpper', 'FirstLowerSomeUpper',
        'Partial', 'EntireLine', 'PartofLine',
        'only left parenthetical present', 'only right parenthetical present',
        'PLB_Yes', 'PLB_No', 'NLB_Yes', 'NLB_No', '<15withNumeric',
        'ssc_15', 'ssc_25', 'ssc_30', 'ssc_35', 'ssc_55', 'ssc_65', 'ssc_gt_65',
        '<15withoutNumeric', 'lcp_35', 'lcp_49', 'lcp_59', 'lcp_72',
        'cur_indent_equals_pnbl', 'cur_indent_equals_nnbl',
        'containsSpecialWords1', 'containsSpecialWords2',
        'containsSpecialWords3', 'containsSpecialWords4',
    ]]

    ## interpolate the in between weights for the starting space count
    # ssc_55 is an anchor too: it must not be blanked before interpolating,
    # otherwise 36..64 are interpolated straight from ssc_35 to ssc_65.
    wts_df = _interpolate_anchor_columns(wts_df, 'ssc', (15, 25, 30, 35, 55, 65))

    ## interpolate the in between weights for the last character placement
    wts_df['lcp_30'] = 1
    wts_df['lcp_75'] = 1
    wts_df = _interpolate_anchor_columns(wts_df, 'lcp', (30, 35, 49, 59, 72, 75))

    wts_df.set_index('Possibilities', inplace=True)
    return wts_df
def give_largest(df, n):
    """Return the ``n`` largest values of a Series, labelled by rank.

    Despite its name, ``df`` is a single pandas Series (one row or column as
    handed over by ``DataFrame.apply``).

    Args:
        df: Series of comparable values.
        n: maximum number of values to keep.

    Returns:
        Series holding the values in descending order, indexed
        ``'1_largest'``, ``'2_largest'``, ...  It is shorter than ``n`` when
        ``nlargest`` returns fewer than ``n`` values.
    """
    largest = df.nlargest(n)
    labels = [f'{rank}_largest' for rank in range(1, len(largest) + 1)]
    return pd.Series(list(largest), index=labels)
def n_largest(df, axis, n):
    """Collect the ``n`` largest values along every row or column of ``df``.

    ``give_largest`` is applied along ``axis`` through ``DataFrame.apply`` and
    the combined result is returned unchanged.
    """
    def _rank(values):
        return give_largest(values, n=n)

    ranked = df.apply(_rank, axis=axis)
    return ranked
def update_parenthetical_neighbor_wt(df):
    """Raise the weights of rows adjacent to a line carrying a parenthetical.

    * ``parenthetical`` in (``StartingLeft``, ``Complete``): ``ps7`` of the
      previous row gets +10 (the row two back when ``plb`` is not ``'N'``).
    * ``parenthetical`` in (``EndingRight``, ``Complete``): ``ps13`` and
      ``ps15`` of the next row get +15 (the row two ahead when ``nlb`` is not
      ``'N'``).

    Neighbours are addressed by index-label arithmetic (``index - 1`` etc.);
    a neighbour that does not exist in ``df.index`` is skipped.

    Args:
        df: frame with ``parenthetical``, ``plb``, ``nlb`` and the ``ps*``
            weight columns.  It is modified in place.

    Returns:
        The same ``df`` object.
    """
    print("updating weghts of parenthetical neighbors")

    def _boost(target, columns, amount):
        # Writes go through .at so they always land in ``df`` itself; chained
        # ``df[col][row] += x`` is not guaranteed to update the frame.
        if target not in df.index:
            return
        for column in columns:
            df.at[target, column] += amount

    for index in df.index:
        par = df['parenthetical'][index]
        if par == 'Absent':
            continue

        ## line before complete of StartingLeft or Complete
        if par in ('StartingLeft', 'Complete'):
            print(index, par)
            try:
                step = 1 if df.at[index, 'plb'] == 'N' else 2
                _boost(index - step, ['ps7'], 10)
            except (KeyError, TypeError):
                # Best effort: missing column or non-numeric index label.
                pass

        ##line after Complete or EndingRight
        if par in ('EndingRight', 'Complete'):
            print(index, par)
            try:
                step = 1 if df.at[index, 'nlb'] == 'N' else 2
                _boost(index + step, ['ps13', 'ps15'], 15)
            except (KeyError, TypeError):
                # Best effort: missing column or non-numeric index label.
                pass
    return df
def gen_pos_weights(df,weights_csv):
# Compute, for every row of df, a numeric weight per possibility column
# (ps1, ps2, ...) from the weights table returned by prep_weights_csv, then
# apply update_parenthetical_neighbor_wt and make sure an 'actual_element'
# column exists. Returns the updated df.
#
# Columns read from df in this function: ssc, line_no, data, plb, nlb,
# pnbl_line_no, nnbl_line_no, case, lcp, preassigned_weights.
#
# NOTE(review): weights are written with chained indexing (df[col][index] += ...),
# which pandas does not guarantee to write through to df.
# prep weights csv
wts_df = prep_weights_csv(weights_csv)
# NOTE(review): function-level imports; several of these are already imported
# at module level and sys/argparse are not used in this function.
import os
import csv
import re
#mport global_file_db
import sys
from pathlib import Path
import argparse
import numpy as np
import pandas as pd
import math
# The script counts as left aligned unless some row has ssc > 16.
left_aligned = True
contains_special = False
for index in df.index:
if df['ssc'][index] > 16:
left_aligned = False
break
print("is script left aligned: ",left_aligned)
# Create/reset the possibility weight columns.
# NOTE(review): i is an int while the tuple holds strings, so this membership
# test is never true and no possibility is skipped (here and in the later loops).
for i in range(1,32):
if i in ('23','24','32','33'):
continue
df["ps{0}".format(i)] = 0
for index in df.index:
line_no = df['line_no'][index]
data = df['data'][index]
plb = df['plb'][index]
nlb = df['nlb'][index]
contains_special = False
print("processing weight for ",line_no)
# if index == 0:
# pnbl_index = 'first'
# elif df['plb'][index] == 'N' :
# pnbl_index = index -1
# elif index - 1 == 0:
# pnbl_index = 'first'
# else:
# pnbl_index = index -2
# if index == df.index[-1]:
# nnbl_index = 'last'
# elif df['nlb'][index] == 'N' :
# nnbl_index = index + 1
# elif index+1 == df.index[-1]:
# nnbl_index = 'last'
# else:
# nnbl_index = index + 2
# Resolve the index of the rows whose line_no equals this row's
# pnbl_line_no / nnbl_line_no; any failure leaves the index as False.
pnbl_index = False
nnbl_index = False
try:
pnbl_line_no = df['pnbl_line_no'][index]
pnbl_index = df.loc[df['line_no'] == pnbl_line_no,:].index.values[0]
except:
pnbl_index = False
try:
nnbl_line_no = df['nnbl_line_no'][index]
nnbl_index = df.loc[df['line_no'] == nnbl_line_no,:].index.values[0]
except:
nnbl_index = False
# ssc of those neighbour rows; -1 when the lookup raises.
try:
pnbl_indent = df['ssc'][pnbl_index]
except:
pnbl_indent = -1
try:
nnbl_indent = df['ssc'][nnbl_index]
except:
nnbl_indent = -1
cur_indent = df['ssc'][index]
# Name of the weights column matching this row's ssc value, e.g. 'ssc_25'.
ssc_col = 'ssc_' + str(cur_indent)
print(ssc_col)
case = df['case'][index]
try:
print("processing line no",line_no, data)
except:
pass
print(plb)
print(nlb)
print(pnbl_indent)
print(nnbl_indent)
### wights to be assigned based on space count, case, parentheseis and plb/nlb
# read the weights csv
#wts_df = pd.read_csv('weights1.csv',index_col = 'Possibilities')
## create the wights for last character placement from 41-78 by interpolation using wwights 51 63 78
lcp = df['lcp'][index]
#print("lcp ",lcp)
# Name of the weights column matching this row's lcp value, e.g. 'lcp_49'.
try:
lcp_col = "lcp_" + str(int(lcp))
except:
lcp_col = "lcp_" + str(lcp)
#print(wts_df.head(0))
# make space dict for getting relevant space columns for weights
# NOTE(review): sp_bin_dict is not read anywhere else in this function.
sp_bin_dict = {1:'0-14',2:'15',3:'16-24',4:'25',5:'26-29',6:'30',7:'31-34',8:'35',9:'36-73',10:'74onwards'
}
#loop over for the possibilities
for i in range(1,32):
if i in ('23','24','32','33'):
continue
df["ps{0}".format(i)][index] = 0
## get weights for the case
# 'EndUpper' and 'MidUpper' share the 'FirstLowerSomeUpper' weight column.
if case in ('EndUpper','MidUpper'):
case = 'FirstLowerSomeUpper'
if case != 'None':
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),case]
## get weights based on the starting space count
# A missing ssc column in wts_df is silently ignored by the bare except.
try:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),ssc_col]
#print("starting wight code was here")
except:
pass
print("ps{0}".format(i),df["ps{0}".format(i)][index])
## get weights for <19 with Numeric character or <19 without Numeric character
# start_num: True when the first non-blank character of data is a digit.
try:
start_num = True if re.search('[0-9]',data.strip()[0]) else False
except:
start_num = False
pos_num = re.search('[0-9]',data)
if (pos_num!= None) and start_num and cur_indent<15:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withNumeric']
elif check_space(data)<15:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withoutNumeric']
if cur_indent>65:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'ssc_gt_65']
## get weights based on the last character placement
print("ps{0}".format(i),df["ps{0}".format(i)][index])
# A missing lcp column in wts_df is reported and otherwise ignored.
try:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),lcp_col]
print("code was here")
print(wts_df.loc["ps{0}".format(i),lcp_col])
except Exception as e:
print ("lcp exception is",e)
pass
# how far is it from position 51 63 78
# 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
# modify the wights matrix and create in between weights
#print("ps{0}".format(i),df["ps{0}".format(i)][index])
# Calculation of weights based on plb and nlb(L-O column in sheet)
if plb == "Y":
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_Yes']
if plb == "N":
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_No']
if nlb == "Y":
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_Yes']
if nlb == "N":
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_No']
#print("ps{0}".format(i),df["ps{0}".format(i)][index])
# Calculation of weights based on parenthesis(H-K column in sheet)
# Four cases: '(' first and ')' last; both present anywhere; only '('; only ')'.
if re.match('\(',data.strip()[:1]) and re.match('\)',data.strip()[-1:]) :
# print('EntireLine')
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'EntireLine']
elif re.search('\(',data.strip()) and re.search('\)',data.strip()) :
#print('PartofLine')
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PartofLine']
elif re.search('\(',data.strip()) and not(re.search('\)',data.strip())) :
#print('only left parenthetical present')
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only left parenthetical present']
elif not(re.search('\(',data.strip())) and re.search('\)',data.strip()) :
#print('only right parenthetical present')
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only right parenthetical present']
#df["ps{0}".format(i)][index] = math.trunc(df["ps{0}".format(i)][index])
#print("i is ",i)
#print(math.trunc(ps_dict["ps{0}".format(i)]))
## Calculation of weights based on indent equals previous / next non blank line
if cur_indent == pnbl_indent:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_pnbl']
if cur_indent == nnbl_indent:
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_nnbl']
print("ps{0}".format(i),df["ps{0}".format(i)][index])
print("Special Words Check")
## calculation of weights based on special words
# Each entry is used as a case-insensitive regex matched at the start of the
# line after ':' characters are removed.
sp_words1 = ['cut to','CUT BACK TO','FLASHCUT TO','dissolve to', 'intercut', 'Inter Cut','PBS', 'interval',
'Flashback','FADE IN','FADE TO BLACK']
for sp_word in sp_words1:
print(sp_word)
search_data = data.replace(":","")
match = re.match(sp_word,search_data.strip(),re.IGNORECASE)
if match:
contains_special = True
break
print (contains_special,search_data)
if not contains_special:
search_data = data.strip()
## check if within quotes
# A line longer than 3 characters that starts and ends with a straight or
# curly double quote is also flagged as special.
if search_data:
if len(search_data) > 3:
if (search_data.startswith('“') or search_data.startswith('"')):
if (search_data.endswith('”') or search_data.endswith('"')):
contains_special = True
if contains_special:
try:
print("found match in ",data)
except:
print("found match ")
for i in range(1,32):
if i in ('23','24','32','33'):
continue
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords1']
## calculation of weights based on special slug words
# NOTE(review): these strings are used as regex patterns, so the '.' in
# 'INT.' / 'EXT.' matches any character. Only the first 8 characters of the
# stripped line are searched.
sp_words3 = ['INT.','EXT.','I/E','E/I','EXT-','INT-']
if not contains_special:
for sp_word in sp_words3:
print(sp_word)
#search_data = data.replace(":","")
found = re.search(sp_word,data.strip()[0:8])
if found:
contains_special = True
try:
print("found match in ",data)
except:
print("found match ")
for i in range(1,32):
if i in ('23','24','32','33'):
continue
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords3']
break
## calculation of weights based on special slug endings
# Searched anywhere in the stripped line (case-sensitive).
sp_words4 = [' - MORNING',' - DAY',' - EVENING',' - EVE',' - NIGHT',' - LATER',' - AFTERNOON']
for sp_word in sp_words4:
found = re.search(sp_word,data.strip())
if found:
contains_special = True
try:
print("found match in ",data)
except:
print("found match ")
for i in range(1,32):
if i in ('23','24','32','33'):
continue
df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords4']
break
# speaker possble if single all caps word
# Condition as coded: at most two words, no '.' or ':' and lcp below 30.
if left_aligned and not contains_special:
if case == 'AllUpper' and len(data.split()) <= 2 and "." not in data and ":" not in data and df['lcp'][index] < 30 :
print("boosting speaker possibility")
df["ps7"][index] += 30
## add preassigned weight
# preassigned_weights is parsed as ';'-separated '<column>-<integer>' items;
# each integer is added to the named column.
if not contains_special:
if df['preassigned_weights'][index]:
pre_psw_list = df['preassigned_weights'][index].split(';')
for psw in pre_psw_list:
ps = psw.split('-')[0]
wt = psw.split('-')[1]
df[ps][index] += int(wt)
df = update_parenthetical_neighbor_wt(df)
if 'actual_element' not in df.columns:
df['actual_element'] = ''
return df
def sort_pos_decr_wts(df):
    """Rank each row's possibility weights and return a reordered frame.

    For every row the 28 possibility weights are sorted in decreasing order.
    The sorted values are inserted into ``df`` as ``1_largest`` ..
    ``28_largest`` (columns 8..35, so ``df`` is modified in place and must
    already have at least 8 columns), the matching possibility names are
    collected as ``Top1`` .. ``Top28``, and both are combined into
    ``'<possibility>-<weight>'`` strings (``first_largest``,
    ``second_largest``, ...).

    Args:
        df: frame holding the ``ps*`` weight columns.

    Returns:
        A new frame restricted to, and ordered by, the fixed output column
        list; columns missing from the input are created as NaN by
        ``reindex``.
    """
    ps_cols = ['ps1', 'ps2', 'ps3', 'ps4', 'ps5', 'ps6', 'ps7', 'ps8', 'ps9', 'ps10',
               'ps11', 'ps12', 'ps13', 'ps14', 'ps15', 'ps16', 'ps17', 'ps18', 'ps19',
               'ps21', 'ps22', 'ps25', 'ps26', 'ps27', 'ps28', 'ps29', 'ps30', 'ps31']
    rank_count = len(ps_cols)
    # Output column prefixes, spelled exactly as downstream code expects them.
    rank_labels = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
                   'eight', 'ninth', 'tenth', 'eleventh', 'twelth', 'thirteenth',
                   'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth',
                   'ninteenth', 'tewenty', 'tone', 'ttwo', 'tthree', 'tfour', 'tfive',
                   'tsix', 'tseven', 'teight']
    top_cols = ['Top{}'.format(rank) for rank in range(1, rank_count + 1)]

    ## sort in decreasing order
    ranked_values = n_largest(df[ps_cols], axis=1, n=rank_count)
    for rank in range(1, rank_count + 1):
        value_col = '{}_largest'.format(rank)
        df.insert(7 + rank, value_col, ranked_values[value_col])

    weights = df[ps_cols]
    ranked_names = weights.apply(
        lambda row: list(weights.columns[np.array(row).argsort()[::-1][:rank_count]]),
        axis=1,
    ).to_list()
    # Reuse df's index so the axis=1 concat below aligns row-for-row even when
    # df does not carry a default RangeIndex.
    Tops = pd.DataFrame(ranked_names, columns=top_cols, index=df.index)
    print(Tops)

    res = pd.concat([df, Tops], axis=1)
    for rank, label in enumerate(rank_labels, start=1):
        res[label + '_largest'] = (
            res['Top{}'.format(rank)] + "-" + res['{}_largest'.format(rank)].astype(str)
        )

    y = (
        ['line_no', 'data', 'actual_element', 'Identification_Status', 'scene_number',
         'plb', 'nlb', 'ssc', 'lcp', 'case', 'parenthetical',
         'isIdentified', 'When_Identified']
        + [label + '_largest' for label in rank_labels]
        + ps_cols
        + ['pnbl_line_no', 'nnbl_line_no', 'ppnbl_line_no', 'nnnbl_line_no',
           'pdil_line_no', 'ndil_line_no']
    )
    df = res.reindex(columns=y)
    return df
def prep_for_pos_elimination(df):
# Prepare df for possibility elimination: for unidentified rows, list the
# candidate possibilities ordered by decreasing weight, record which ones
# must not be eliminated ('ps_not_to_remove'), prune parenthetical-only
# possibilities, restrict the first/last line, and re-sort.
# df is modified in place; there is no return statement in this function.
#
# NOTE(review): cells are written with chained indexing (df[col][index] = ...),
# which pandas does not guarantee to write through to df.
all_pos = [ "ps{0}".format(ps) for ps in range(1,35) ]
df.insert(12,'Identification_Status_with_weights','')
# In[24]:
for x in ['ps23','ps24','ps32','ps33','ps34']:
all_pos.remove(x)
# In[25]:
## go through all lines
## if line is identified add all possibilties
## get weights of the psssibilities
## sort the possibilities in decreasing order of weights
# Sort key: integer found after the first '-' of a 'psN-weight' string.
# NOTE(review): a negative weight gives 'psN--5', whose second field is ''
# and would make int() raise - confirm weights are never negative.
def useWeights(ps):
return int(ps.split("-")[1])
for index in df.index:
if df["isIdentified"][index] == 'No':
# all_pos is only iterated here, never mutated, so sharing the list is safe.
cur_line_pos = all_pos
## append the weight to the possibilites
pos_with_weights = []
for pos in cur_line_pos:
wt = ''
pos_wt = str(pos)
# Possibilities whose column is missing or whose weight cannot be cast
# to int are skipped.
try:
wt = df[pos][index].astype(int)
pos_wt += '-' + str(wt)
except:
continue
pos_with_weights.append(pos_wt)
# now sort in descending order using the weights as key
pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
## copy over to identification status without the weights but in order of decreasing weights
pos_without_weight = []
for pos in pos_with_weights:
pos_without_weight.append(pos.split("-")[0])
line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
print(line_pos_string)
df['Identification_Status'][index] = line_pos_string
## make a column which indicates the possibilities not to be removed
## possibilities not to eliminate
df['ps_not_to_remove'] = 'ps34'
# df['parenthetical'] = ''
for index in df.index:
# par = ''
#
# print(data)
# if re.match('\(',data.strip()[:1]):
# if re.match('\)',data.strip()[-1:]) :
# par = 'Complete'
# elif re.search('\)',data.strip()) :
# par = 'PartBeginningMid'
# else:
# par = 'Beginning'
# elif re.match('\)',data.strip()[-1:]):
# if re.search('\(',data.strip()):
# par = 'PartMidEnd'
# else:
# par = 'End'
# # beginning end already checked so now if paren present it is mixed
# elif re.search('\(',data.strip()) and re.search('\)',data.strip()):
# par = 'PartMidMid'
# elif re.search('\(',data.strip()):
# par = 'MixedBeginning'
# elif re.search('\)',data.strip()):
# par = 'MixedEnd'
# else:
# par = 'Absent'
# df['Parenthetical'][index] = par
data = df['data'][index]
pos_not_to_remove = []
if df["isIdentified"][index] == 'No':
## find the top possibilities - max weight
pos_with_wts = df["Identification_Status_with_weights"][index].split(";")
max_pos_index = 0
wt1 = 0
wt2 = 0
print(df["line_no"][index])
# Walk the sorted list while consecutive weights are equal; max_pos_index
# ends at the last entry tied with the top weight. The weights are compared
# as strings here.
for k in range(0,len(pos_with_wts)-1):
wt1 = pos_with_wts[k].split("-")[1]
wt2 = pos_with_wts[k+1].split("-")[1]
print(wt1,wt2)
if wt2 == wt1:
max_pos_index = k+1
continue
else:
break
print (max_pos_index)
# Every possibility tied for the top weight is protected from removal.
for j in range(0,max_pos_index+1):
pos_not_to_remove.append(df["Identification_Status"][index].split(";")[j])
# if entire line in parenthetical don't remove ps8,ps10
if re.match('\(',data.strip()[:1]) and re.match('\)',data.strip()[-1:]):
pos_not_to_remove.append('ps8')
pos_not_to_remove.append('ps10')
# First word longer than one character and upper-case: protect ps8, ps25,
# ps30, plus ps7 when the line is a single word.
# NOTE(review): words[0] raises IndexError when data has no words - confirm
# this code is only reached for non-blank lines.
words = data.split()
if len(words[0]) > 1 and words[0].isupper():
pos_not_to_remove.append('ps8')
pos_not_to_remove.append('ps25')
pos_not_to_remove.append('ps30')
if len(words) == 1:
pos_not_to_remove.append('ps7')
# '(' followed later by ')' where the line does not start with '(':
# protect ps25 and ps27.
left_p = True if re.search('\(',data) else False
right_p = True if re.search('\)',data) else False
if left_p and right_p and not(re.match('\(',data.strip()[:1])):
if (re.search('\)',data).start() -re.search('\(',data).start()) > 0:
pos_not_to_remove.append('ps25')
pos_not_to_remove.append('ps27')
# De-duplicate; set() does not preserve the order of the entries.
pos_not_to_remove = list(set(pos_not_to_remove))
df['ps_not_to_remove'][index] = (";").join(str(elem) for elem in pos_not_to_remove)
## go thorugh all the lines , if parenthetical is absent remove ps 8,10,11,12,25,26,27
# NOTE(review): the comment above mentions ps8 but the list below does not contain it.
pos_to_remove = ['ps10','ps11','ps12','ps25','ps26','ps27']
for index in df.index:
if df['parenthetical'][index] == 'Absent':
cur_line_pos = df["Identification_Status"][index].split(";")
pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
cur_line_pos = [pos for pos in cur_line_pos if pos not in pos_to_remove]
pos_not_to_remove = [pos for pos in pos_not_to_remove if pos not in pos_to_remove]
df["Identification_Status"][index] = ";".join(cur_line_pos)
df['ps_not_to_remove'][index] = ";".join(pos_not_to_remove)
# Lines whose parenthetical covers only part of the line lose ps11 and ps12.
pos_to_remove = ['ps11','ps12']
for index in df.index:
if df['parenthetical'][index] in ('PartMidEnd','PartStartMid','PartMidMid'):
cur_line_pos = df["Identification_Status"][index].split(";")
pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
cur_line_pos = [pos for pos in cur_line_pos if pos not in pos_to_remove]
pos_not_to_remove = [pos for pos in pos_not_to_remove if pos not in pos_to_remove]
df["Identification_Status"][index] = ";".join(cur_line_pos)
df['ps_not_to_remove'][index] = ";".join(pos_not_to_remove)
## refine the possibilties of first and last line
# The first line is taken at label 0 and the last at df.index[-1]; each moves
# one row inward when its 'case' is 'None'.
# NOTE(review): assumes df carries a 0-based integer index - confirm.
first_line_index = 0
if df['case'][first_line_index] == 'None':
first_line_index += 1
last_line_index = df.index[-1]
if df['case'][last_line_index] == 'None':
last_line_index -= 1
## keep possibilities of first and last line
eligible_pos = ['ps1','ps2','ps17','ps18']
first_line_pos = df["Identification_Status"][first_line_index].split(";")
# Keeps the existing (weight-sorted) order of the first line's possibilities.
first_line_pos = [ps for ps in first_line_pos if ps in eligible_pos ]
df['Identification_Status'][first_line_index] = ";".join(first_line_pos)
df['ps_not_to_remove'][first_line_index] = ""
## keep possibilities of last line
eligible_pos = ['ps6','ps15','ps16','ps17','ps29','ps30','ps31']
last_line_pos = df["Identification_Status"][last_line_index].split(";")
# Unlike the first line, the result here follows the order of eligible_pos.
last_line_pos = [ps for ps in eligible_pos if ps in last_line_pos ]
df['Identification_Status'][last_line_index] = ";".join(last_line_pos)
df['ps_not_to_remove'][last_line_index] = ""
# In[26]:
# Final pass: rows that still have more than one possibility are re-sorted by
# decreasing weight and both status columns are rewritten.
for index in df.index:
cur_line_pos = df["Identification_Status"][index].split(";")
if len(cur_line_pos) == 1:
continue
## append the weight to the possibilites
pos_with_weights = []
for pos in cur_line_pos:
wt = ''
pos_wt = str(pos)
try:
wt = df[pos][index].astype(int)
pos_wt += '-' + str(wt)
except:
continue
pos_with_weights.append(pos_wt)
# now sort in descending order using the weights as key
pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
## copy over to identification status without the weights but in order of decreasing weights
pos_without_weight = []
for pos in pos_with_weights:
pos_without_weight.append(pos.split("-")[0])
line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
print(line_pos_string)
df['Identification_Status'][index] = line_pos_string
def examine_speaker_pos(df,audit_df):
# Purpose: use the speaker names already identified in df (rows whose
# Identification_Status is exactly 'ps7') to (1) find further lines that are
# speaker names, (2) repair speaker names that were split over two lines,
# (3) normalise case / inner spacing of speaker lines and (4) narrow the
# possibility list of any remaining line that contains a known speaker name.
#
# Parameters:
#   df       - DataFrame of script lines. Columns read or written here:
#              'data', 'line_no', 'Identification_Status', 'isIdentified',
#              'parenthetical', 'When_Identified', 'case', 'ssc', 'nlb',
#              'plb', 'ps_not_to_remove'.
#   audit_df - audit DataFrame, addressed by line number as row label; the
#              columns written are 'line_removed', 'case_corrected',
#              'space_removed_between_characters' and
#              'line_broken_into_multiple_lines'. It is modified in place.
# Returns:
#   A new DataFrame with a fresh 0..n-1 index and 'isIdentified' recomputed.
#
# NOTE(review): this copy of the file has lost its leading indentation, so the
# nesting described in the comments below is inferred from statement order -
# confirm against an indented copy before relying on it.
print("examining speaker possibilties")
# Speaker names come only from rows identified as exactly 'ps7'; the list of
# speaker line numbers also includes rows identified as 'ps8'.
speaker_list = df.loc[df['Identification_Status'] == 'ps7','data'].astype(str)
speaker_list = [ elem.strip() for elem in speaker_list ]
speaker_lines_list = df.loc[(df['Identification_Status'] == 'ps7') | (df['Identification_Status'] == 'ps8'),'line_no'].to_list()
unique_speaker_list = []
speaker_in_two_lines_list = []
# De-duplicate the speaker names while keeping first-seen order.
for speaker in speaker_list:
speaker = speaker.strip()
#print(speaker)
if speaker not in unique_speaker_list:
unique_speaker_list.append(speaker)
## strip the blank spaces
try:
print(unique_speaker_list)
except:
pass
print(speaker_lines_list)
# In[173]:
# Pass 1: scan every row that is not already 'ps7'/'ps8' and collect
#  - lines whose text (upper-cased, with or without inner whitespace) is a
#    known speaker name, and
#  - lines that form a known speaker name only when joined with the next row.
for index in df.index:
line_no = df['line_no'][index]
data = df['data'][index].strip()
if df['Identification_Status'][index] == 'ps7':
continue
if df['Identification_Status'][index] == 'ps8':
continue
try:
if ("".join(data.split()).upper() in unique_speaker_list) or (data.upper() in unique_speaker_list):
print (line_no,data)
# Only not-yet-identified lines are added as new speaker lines.
if line_no not in speaker_lines_list and df['isIdentified'][index] != 'Yes':
speaker_lines_list.append(line_no)
except:
print(line_no,data,"data is not str")
pass
# Look ahead one row (label index+1) to detect a speaker split over two lines.
# NOTE(review): index+1 is used as a row label, which presumes df has a
# contiguous integer index at this point - confirm with the caller.
if index != df.index[-1]:
nl_data = df['data'][index+1]
cur_par = df['parenthetical'][index]
if data.strip() and nl_data.strip() and cur_par == 'Absent':
two_line_data = "".join((data+nl_data).split())
#print(two_line_data)
#print(two_line_data.lstrip().split(" ")[0])
#print(unique_speaker_list)
# NOTE(review): unlike the single-line test above, the joined text is not
# upper-cased before the membership test.
if two_line_data in unique_speaker_list:
#print("Yes")
print(line_no,data)
print(line_no,nl_data)
speaker_in_two_lines_list.append(line_no)
# Joined text up to the first '(' is a speaker, but the first line alone
# (up to its first '(') is not.
elif two_line_data.lstrip().split("(")[0] in unique_speaker_list and data.lstrip().split("(")[0].strip() not in unique_speaker_list :
print(line_no,data)
print(line_no,nl_data)
speaker_in_two_lines_list.append(line_no)
speaker_lines_list.sort()
speaker_in_two_lines_list.sort()
print(speaker_lines_list)
print(speaker_in_two_lines_list)
# In[174]:
# create new df with line no as index
df_line_index =df.copy().set_index('line_no')
df_line_index.head()
## correct the speaker in two lines
## first go through speaker in two lines
# Pass 2: repair speakers split over two lines. The next line is addressed as
# sp_line+1, i.e. it presumes line numbers of adjacent rows differ by exactly 1.
for sp_line in speaker_in_two_lines_list:
if df_line_index['Identification_Status'][sp_line] == 'ps8':
continue
data = df_line_index['data'][sp_line]
line_no = sp_line
nl_data = df_line_index['data'][sp_line+1]
new_data = data.strip() + nl_data.strip()
# If the joined text contains '(', split it at the first '(': the part before
# stays on the speaker line, the part from '(' onwards becomes the next line.
if re.search('\(',new_data):
par_pos = re.search('\(',new_data).start()
before_par = new_data[:par_pos]
after_par = new_data[par_pos:]
df_line_index['data'][sp_line] = before_par
df_line_index['data'][sp_line+1] = after_par
# Last character is ')' -> the next line is a complete parenthetical.
if re.match('\)',after_par.strip()[-1]):
df_line_index['Identification_Status'][sp_line+1] = 'ps10'
df_line_index['parenthetical'][sp_line+1] = 'Complete'
df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
# NOTE(review): re.match anchors at the start of the string, and after_par
# starts at the '(' located above, so this test cannot succeed; re.search
# may have been intended - confirm.
elif re.match('\)',after_par.strip()):
df_line_index['Identification_Status'][sp_line+1] = 'ps26'
df_line_index['parenthetical'][sp_line+1] = 'PartStartMid'
df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
else:
df_line_index['Identification_Status'][sp_line+1] = 'ps11'
df_line_index['parenthetical'][sp_line+1] = 'StartingLeft'
df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
# add line no to speaker lines
speaker_lines_list.append(sp_line)
# print to report
# audit_report.write("%s.line no: %s , Found Speaker in two lines with continuing parenthetical, Separated speaker \n" %(audit_sno,sp_line))
# audit_sno += 1
else:
# print to report
# NOTE(review): audit_sno is neither a parameter nor assigned anywhere in this
# function; unless a module-level audit_sno exists this print raises NameError.
print("%s.line no: %s , Found Speaker in two lines merged the line \n" %(audit_sno,sp_line))
# audit_report.write("%s.line no: %s , Found Speaker in two lines merged the line \n" %(audit_sno,sp_line))
# audit_sno += 1
# correct the line
df_line_index['data'][sp_line] = new_data
# add line no to speaker lines
speaker_lines_list.append(sp_line)
# delete the next line
df_line_index.drop((sp_line+1),inplace= True)
audit_df['line_removed'][sp_line+1] = 'Yes'
print (data,nl_data,new_data)
# In[176]:
## go through the speaker lines and rectify them , print to audit report
# Pass 3: every collected speaker line that is not already 'ps7'/'ps8' is
# marked as 'ps7' and its text is normalised.
for sp_line in speaker_lines_list:
line_no = sp_line
##identify the above identified speaker lines as ps7
if df_line_index['Identification_Status'][sp_line] in ('ps7','ps8'):
continue
df_line_index['Identification_Status'][sp_line] = 'ps7'
df_line_index['parenthetical'][sp_line] = 'Absent'
# Keep the original stage label when the line was found by the first strict pass.
if df_line_index['When_Identified'][sp_line] != 'FirstStrictConditions' :
df_line_index['When_Identified'][sp_line] = 'ExaminingSpeakerLines'
sp_data = df_line_index['data'][sp_line].strip()
print(sp_line)
try:
print(sp_data)
except:
pass
# Re-use the line's current 'ssc' value as indent; fall back to 35 when it is
# not convertible to int.
cur_indent = df_line_index['ssc'][sp_line]
try:
new_speaker_indent = int(cur_indent)
except:
new_speaker_indent = 35
## check and correct case
if df_line_index['case'][sp_line] != 'AllUpper':
try:
print(sp_data)
except:
pass
# rjust left-pads the stripped text with new_speaker_indent spaces.
df_line_index['data'][sp_line] = sp_data.rjust(len(sp_data)+new_speaker_indent).upper()
df_line_index['case'][sp_line] = 'AllUpper'
try:
print("case corrected to:",df_line_index['data'][sp_line])
except:
pass
audit_df['case_corrected'][line_no] = 'Speaker Case corrected to All Upper Case'
## check and correct gap between speaker name
sp_data_nogap = "".join(sp_data.split())
if sp_data != sp_data_nogap:
try:
print(sp_data)
except:
pass
## removing gap only if first world is single
# i.e. inner whitespace is removed only when the first word is one character long.
first_word = sp_data.split()[0]
if len(first_word) == 1:
df_line_index['data'][sp_line] = (sp_data_nogap.upper()).rjust(len(sp_data_nogap)+new_speaker_indent)
try:
print("speaker name corrected to:",df_line_index['data'][sp_line])
except:
pass
audit_df['space_removed_between_characters'][line_no] = 'Yes'
# Back to a positional 0..n-1 index; 'line_no' becomes a normal column again.
df_line_index.reset_index(inplace=True)
df = df_line_index.sort_index().reset_index(drop=True)
## mixed speaker identification
## use the speakers to idenfity mixed speaker lines ps8, 25,30,21,28
# In[184]:
df['ps_not_to_remove'] = df['ps_not_to_remove'].astype(str)
try:
print(unique_speaker_list)
except:
pass
print("performing checks for speaker followed by parenthetical")
# Pass 4: for each row not identified as 'ps8', search the text for every known
# speaker name and adjust the row's possibility list according to what
# surrounds the match.
# NOTE(review): df is re-assigned (row inserted, sorted, re-indexed) inside
# this loop while the loop keeps iterating the index captured at loop start,
# so later index values may refer to shifted rows - confirm this is intended.
for index in df.index:
if df['Identification_Status'][index] == 'ps8':
continue
line_no = df['line_no'][index]
print("line_no",line_no)
print(df.dtypes)
new_line_no = 0.0
data = df['data'][index]
cur_pos_list = df['Identification_Status'][index].split(";")
top_pos = cur_pos_list[0]
# NOTE(review): new_pos_list is the same list object as cur_pos_list (no copy),
# so the remove() calls further down also change cur_pos_list.
new_pos_list = cur_pos_list
try:
pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
except:
pos_not_to_remove = []
# Protected possibilities = those listed in 'ps_not_to_remove' that are also
# currently possible for this row.
set_1 = set(cur_pos_list)
set_2 = set(pos_not_to_remove)
pos_not_to_remove = list(set.intersection(set_1,set_2))
for speaker in unique_speaker_list:
check_done =False
# NOTE(review): the speaker name is used as a regular-expression pattern
# without re.escape, and this first search is case-sensitive while the two
# searches below use re.IGNORECASE (so they may match at an earlier position).
if re.search(speaker,data) and df['Identification_Status'][index] not in ('ps7','ps8') :
# check if speaker is at start of line followed by something (like parenthetical)
pos_starts = re.search(speaker,data,re.IGNORECASE).start()
pos_end = re.search(speaker,data,re.IGNORECASE).end()
before_speaker = data[:pos_starts]
after_speaker = data[pos_end:]
print("match found")
try:
print("data 2347:",data)
print("speaker 2348:", speaker)
print("before speaker 2349 :", before_speaker)
print("after speaker 2350:",after_speaker)
except:
pass
# First non-blank character after the speaker name, or '' when there is none.
try:
char1_after_speaker = after_speaker.lstrip()[0]
except:
char1_after_speaker = ''
cur_indent = df['ssc'][index]
try:
new_speaker_indent = int(cur_indent)
except:
new_speaker_indent = 35
try:
print(before_speaker)
print(after_speaker)
print("char1_after_speaker 2367:",char1_after_speaker)
except:
pass
## separate parenthtical if speaker is followed by parenthtical
# Case A: only whitespace before the speaker, '(' right after it, the row is
# flagged 'PartMidEnd' and the remainder does not contain "V.O.": split the row
# into a speaker row ('ps7') and a new complete-parenthetical row ('ps10').
if before_speaker.isspace() and char1_after_speaker == '(' and df['parenthetical'][index] == 'PartMidEnd' and "V.O." not in str(after_speaker):
#print("before_speaker 2372:", before_par)
print ("Seperating parenthetical")
print("Identifying speaker")
print(index)
df['data'][index] = before_speaker + speaker
df['parenthetical'][index] = 'Absent'
df['When_Identified'][index] = 'ExaminingSpeakerLines'
df['case'][index] = 'AllUpper'
df['Identification_Status'][index] = 'ps7'
# Remember the old next-line-blank flag; it is handed to the new row below.
nlb = df['nlb'][index]
df['nlb'][index] = 'N'
audit_df['line_broken_into_multiple_lines'][line_no] = 'Separated Speaker and Parenthetical'
#print(df['Identification_Status'][index])
#new_line_no = str(int(line_no) + 0.5)
new_line_no = line_no + 0.5
print(type(line_no),type(new_line_no))
print("identifying parenthetical")
print(df.dtypes)
# Insert the new row under the fractional label index+0.25 so that
# sort_index() places it directly after the current row.
df.loc[index + 0.25] = np.nan
df.loc[index + 0.25,'data'] = str(after_speaker)
df.loc[index + 0.25,'parenthetical'] = 'Complete'
df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines'
df.loc[index + 0.25,'Identification_Status'] = 'ps10'
df.loc[index + 0.25,'case'] = ''
df.loc[index + 0.25,'plb'] = 'N'
df.loc[index + 0.25,'nlb'] = nlb
df.loc[index + 0.25,'line_no'] = new_line_no
#print(df.loc[index + 0.25]['line_no'] ,new_line_no)
#df['line_no'] = pd.to_numeric(df['line_no'],errors ='coerce')
df = df.sort_index().reset_index(drop=True)
# Register the new line number in the audit frame as well.
audit_df.loc[new_line_no] = np.nan
audit_df['line_removed'][new_line_no] = 'No'
print(df.dtypes)
print(audit_df.dtypes)
continue
# Case B: speaker at line start followed by '(' (row not 'PartMidMid'):
# the possibilities become ps8 / ps25.
elif before_speaker.isspace() and char1_after_speaker == '(' and df['parenthetical'][index] != 'PartMidMid':
##make the line possibilities as ps8;ps25
## next non-blank characteer after speker is left (
print ("parenthetical mix")
new_pos_list = ['ps8','ps25']
## check and change the indent to speaker indent of 35
if df['ssc'][index] != 35:
df['ssc'][index] = new_speaker_indent
df['data'][index] = data.strip().rjust(len(data.strip()) + new_speaker_indent)
# with open(audit_report_path,'a') as audit_report:
# audit_report.write("%s. line no: %s , Corrected Speaker Mix indent to \n" %(audit_sno,index))
# audit_sno += 1
#df_line_index['Identification_Status'][index] = 'ps8;ps25'
# break
# Case C: speaker at line start followed by non-blank text that is not a
# parenthetical: drop the possibilities listed in ps_remove.
elif before_speaker.isspace() and (not after_speaker.isspace()) :
## add code to remove
# cannot be ps1,ps2,ps3,ps7,ps9,ps10,ps11,ps12,ps16,ps17,ps18,ps19,ps21,ps22,ps26,ps27,ps28,ps29,ps31,
ps_remove = ['ps1','ps2','ps3','ps7','ps9','ps10','ps11','ps12','ps16','ps17','ps18','ps19','ps21','ps22','ps26','ps27','ps28','ps29','ps31']
for ps in ps_remove:
try:
new_pos_list.remove(ps)
except:
continue
#df_line_index['Identification_Status'][index] = 'ps30;ps4;ps5;ps6;ps8;ps13;ps14;ps15'
print ("present but not parenthetical removed except - ps30;ps4;ps5;ps6;ps8;ps13;ps14;ps15")
# break
# Case D: text before the speaker and only whitespace after it.
elif (not before_speaker.isspace()) and after_speaker.isspace():
new_pos_list = ['ps21','ps28','ps5','ps4']
#df_line_index['Identification_Status'][index] = 'ps21;ps28'
print ("before speaker present")
# check_done triggers the break at the end of the speaker loop body.
check_done = True
else:
new_pos_list = cur_pos_list
print("no change done")
## append the posibility which were not to be removed back to the list if it got eliminated in the intersection
for ps in pos_not_to_remove:
if ps not in new_pos_list:
new_pos_list.insert(0,ps)
df['Identification_Status'][index] = ";".join([str(elem) for elem in new_pos_list])
print('\n')
if check_done:
break
# Final pass: a row counts as identified when it is 'blank' or has exactly one
# possibility left.
for index in df.index:
if df['Identification_Status'][index] == 'blank' or (len(df['Identification_Status'][index].split(";")) == 1):
df['isIdentified'][index] = 'Yes'
else:
df['isIdentified'][index] = 'No'
return df
def examine_speaker_next_lines(df,audit_df):
# Purpose: for every speaker line (Identification_Status 'ps7' or 'ps8'),
# classify the lines that follow it: parenthetical lines directly after the
# speaker (ps10/ps11/ps12/ps20), then the run of dialogue lines (ps13 beginning,
# ps14 middle, ps15 end, ps26/ps27 mixed with parenthetical). Text in
# parentheses at the end of a dialogue run is split off into new action rows
# (ps4/ps5/ps6), prefixed with the speaker name.
#
# Parameters:
#   df       - DataFrame of script lines; its index is reset in place first.
#              Columns used: 'data', 'line_no', 'Identification_Status',
#              'isIdentified', 'parenthetical', 'When_Identified', 'case',
#              'ssc', 'nlb', 'plb'.
#   audit_df - audit DataFrame addressed by line number; new rows are added
#              and 'line_removed' / 'line_broken_into_multiple_lines' are set.
# Returns:
#   The updated DataFrame (a new object once a row has been inserted), with
#   'isIdentified' recomputed for every row.
#
# NOTE(review): this copy of the file has lost its leading indentation, so the
# nesting described in the comments below is inferred from statement order -
# confirm against an indented copy before relying on it.
df.reset_index(inplace=True, drop=True)
## identifying lines after speaker
## get the speaker lines
speaker_lines_list = df.loc[(df['Identification_Status'] == 'ps7') | (df['Identification_Status'] == 'ps8'),'line_no'].to_list()
## go through the speaker lines and identify the lines after speaker
for line in speaker_lines_list:
# NOTE(review): blank_to_delete is filled below but never read in this function.
blank_to_delete = []
# Row positions change when rows are inserted, so the speaker row is looked up
# again by its line number on every iteration.
index = df.loc[df['line_no'] == line,:].index.values[0]
data = df['data'][index]
speaker_name = data.strip()
print("\n")
print("speaker index",index)
print("speaker line no",line)
try:
print("data:\n",data)
except:
pass
# move the index to next nbl line to check it
if df['nlb'][index] == 'Y':
index += 2
else:
index += 1
possible_dialog_line = False
parenthetical_begun = False
## examine the lines(if any) after speaker and before dialougue
## move index till end paranthetical comes
# NOTE(review): index is advanced without a bounds check; if the speaker is
# near the end of df the lookups below would raise KeyError - confirm inputs.
while not possible_dialog_line:
data = df['data'][index]
line_no = df['line_no'][index]
cur_line_par = df['parenthetical'][index]
print("examining line")
try:
print(data)
print(line_no,cur_line_par)
except:
pass
# Already a complete parenthetical: just step over it.
if df['Identification_Status'][index] == 'ps10':
index += 1
#possible_dialog_line = True
elif cur_line_par == 'Complete':
# complete line in paranthetical
print("Identifying as Parenthetical Complete")
df['Identification_Status'][index] = 'ps10'
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
index +=1
possible_dialog_line = True
elif cur_line_par == 'StartingLeft' and not(parenthetical_begun) :
# line has paranthetical in beginning only
print("identifying as parenthetical Beginning")
df['Identification_Status'][index] = 'ps11'
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
parenthetical_begun = True
index +=1
elif cur_line_par == 'EndingRight' and parenthetical_begun:
# line has paranthetical at end only
print("Identifying as parenthetical end")
df['Identification_Status'][index] = 'ps12'
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
index +=1
possible_dialog_line = True
elif cur_line_par == 'Absent' and parenthetical_begun:
# multiple lines could be in paranthetical
print("Identifying as parenthetical middle")
df['Identification_Status'][index] = 'ps20'
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
index +=1
elif df['Identification_Status'][index] == 'ps13' or df['Identification_Status'][index] == 'ps14':
## dialogue end cans still be there
index += 1
# elif cur_line_par == 'PartBeginningMid':
# # starting part of line in paranthetical
# print("Identifying as parenthetical mix with succeeding dialogue")
# df['Identification_Status'][index] = 'ps26'
# df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
# index +=1
# possible_dialog_line = True
# elif cur_line_par == 'PartMidEnd':
# # end part line in parenthetical
# print("Identifying as parenthetical mix with preceeding dialogue")
# df['Identification_Status'][index] = 'ps27'
# df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
# ## breaking the line to dialogue and parenthtical complete-
# index +=1
# possible_dialog_line = True
# Reached only when no parenthetical is open (the 'Absent and
# parenthetical_begun' case is handled earlier in this chain).
elif cur_line_par == 'Absent':
print("line should be dialogue")
possible_dialog_line = True
else:
print("line could be dialogue")
possible_dialog_line = True
print(line_no,possible_dialog_line,parenthetical_begun)
## skip if blank
if df['Identification_Status'][index] == 'blank':
print("skipping blank line")
blank_to_delete.append(index)
index +=1
# check if the current line has possibility of being a dialogue , if not move to next speaker line
if 'ps15' not in df['Identification_Status'][index].split(";") :
print("line does not have possibility of dialogue, so cannot process")
continue
else:
print("dialogue line(s) after speaker")
cur_indent = df['ssc'][index]
cur_line_par = df['parenthetical'][index]
next_line_blank = True if df['nlb'][index] == 'Y' else False
# NOTE(review): presumably this break leaves the loop over speaker lines
# altogether when fewer than two rows remain after the current one - confirm.
if index+2 > df.index[-1]:
break
# Load the attributes of the next non-blank line (two rows ahead when the
# directly following row is blank); j is the row label of that line.
if next_line_blank:
next_nbl_indent = df['ssc'][index+2]
next_nbl_par = df['parenthetical'][index+2]
next_nbl_case = df['case'][index+2]
next_nbl_data = df['data'][index+2]
j = index + 2
else:
next_nbl_indent = df['ssc'][index+1]
next_nbl_par = df['parenthetical'][index+1]
next_nbl_case = df['case'][index+1]
next_nbl_data = df['data'][index+1]
j = index + 1
# start_index = first candidate dialogue row, rev_index = last row of the run.
start_index = index
rev_index = index
lines_count = 1
data = df['data'][index]
print("all lines after speaker with same indent with parentheticals")
try:
print(data)
except:
pass
dia_indent = cur_indent
print(dia_indent)
## adding condition - next line can be parenthetical
## removing conditions - next to next line check not required
## get the number of line with same indent
# Extend the run while the following line has the same indent (as the previous
# line or as the first dialogue line) or is a parenthetical line, is not all
# upper case, and is not separated from the run by a blank line.
while (cur_indent == next_nbl_indent or dia_indent == next_nbl_indent or next_nbl_par in ('Complete','StartingLeft','EndingRight')) and not (next_nbl_case == 'AllUpper') and (not next_line_blank): # or next_nbl_case in ('AllLower','MidUpper','Partial')): ## considering dialogue ends if next blank except next line blank and nn line case Alllower
#and len(next_nbl_data.split()) == 1):
lines_count +=1
cur_indent = next_nbl_indent
cur_line_par = next_nbl_par
next_line_blank = True if df['nlb'][j] == 'Y' else False
data = df['data'][j]
try:
print(data)
print(cur_indent)
except:
pass
rev_index = j
# Stop before reading past the last row.
if j+1 >= df.index[-1]:
break
if next_line_blank:
break ## as not considering space between dialogue .. action becoming dialogue
#next_nbl_indent = df['ssc'][j+2]
#next_nbl_par = df['parenthetical'][j+2]
#next_nbl_case = df['case'][j+2]
#next_nbl_data = df['data'][j+2]
#j += 2
else:
next_nbl_indent = df['ssc'][j+1]
next_nbl_par = df['parenthetical'][j+1]
next_nbl_case = df['case'][j+1]
next_nbl_data = df['data'][j+1]
j += 1
print("\n Next line indent is",next_nbl_indent)
## now lets start examing these lines in reverse order
## if last line has parenthtical end or complete then it is action mixed not dialogue
last_line_par = cur_line_par
data = df['data'][rev_index]
dialogue_end_identified = False
dia_end = rev_index
###
# last line is mixed with parenthetical. if it is parenthetical Mid end -> separate to new action line
if last_line_par == 'PartMidEnd':
print("Dialogue mixed with parenthetical")
#separate line to before and after parenthetical
par_start = re.search('\(',data).start()
before_par = data[:par_start]
after_par = data[par_start:]
# make current line as before par and tag as dialogue
print(" identifying before parenthentical line as ps15")
try:
print(before_par)
except:
pass
df['data'][rev_index] = before_par
# Keep the old next-line-blank flag for the row that is inserted below.
next_line_flag = df['nlb'][rev_index]
df['nlb'][rev_index] = 'N'
df['Identification_Status'][rev_index] = 'ps15'
df['parenthetical'][rev_index] = 'Absent'
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index] = 'Yes'
dialogue_end_identified = True
#make new next line as action line ps6
print("action after dialogue, separating to newline , identifying line as ps6")
try:
print(after_par)
print("after_par is here")
except:
pass
print("df['line_no'][rev_index]:",df['line_no'][rev_index])
print(df['line_no'][rev_index])
# The new row gets a line number halfway between this line and the next one;
# if that number already exists in audit_df, halve the gap once more.
line_no = df['line_no'][rev_index]
next_line_no = df['line_no'][rev_index+1]
new_line_no = (float(line_no) + float(next_line_no)) / 2
if new_line_no in audit_df.index:
new_line_no = (float(new_line_no) + float(next_line_no))/2
# (self-assignment below has no effect)
new_line_no = new_line_no
audit_df.loc[new_line_no] = np.nan
# NOTE(review): .loc[row][col] = value is a chained assignment; it may write to
# a temporary copy instead of audit_df - confirm the audit values are stored.
audit_df.loc[new_line_no]['line_removed'] = 'No'
audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
# add line before action end
# New row is keyed rev_index+0.25 so sort_index() puts it right after the
# dialogue row; its text is the speaker name plus the text without brackets.
df.loc[rev_index + 0.25] = np.nan
df.loc[rev_index + 0.25,'ssc'] = 0
new_data = speaker_name.capitalize() + ' ' + after_par.replace('(','').replace(')','')
df.loc[rev_index + 0.25,'data'] = new_data
df.loc[rev_index + 0.25,'case'] = ''
df.loc[rev_index + 0.25,'plb'] = 'N'
df.loc[rev_index + 0.25,'nlb'] = next_line_flag
df.loc[rev_index + 0.25,'Identification_Status'] = 'ps6'
df.loc[rev_index + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
df.loc[rev_index + 0.25,'isIdentified'] = 'Yes'
df.loc[rev_index + 0.25,'parenthetical'] = 'Absent'
df.loc[rev_index + 0.25,'line_no'] = new_line_no
try:
print(new_data)
except:
pass
# insert audit report
df = df.sort_index().reset_index(drop=True)
elif last_line_par == 'EndingRight':
print("last line has parenthtical end")
# if parentical is end then find the beginning and split after begiining identify as action
# j is re-used here as a backward offset from rev_index.
# NOTE(review): the search has no lower bound; if no 'StartingLeft'/'MixedLeft'
# row precedes rev_index the lookup eventually fails with KeyError - confirm.
j=1
beginning_not_found = True
while beginning_not_found:
print("looking for beginning parenthtical")
data = df['data'][rev_index-j]
try:
print(data)
except:
pass
if df['parenthetical'][rev_index-j] in ('StartingLeft','MixedLeft') :
beginning_not_found = False
else:
j+=1
# Always true once the loop above has finished.
if beginning_not_found == False :
print("parenthetical beginning found")
# Opening bracket sits in the middle of a line inside the run: split that line
# into dialogue end (ps15) and a new row holding the bracketed text.
if df['parenthetical'][rev_index-j] == 'MixedLeft' and (rev_index-j)>=start_index:
data = df['data'][rev_index-j]
#separate line to before and after parenthetical
par_start = re.search('\(',data).start()
before_par = data[:par_start]
after_par = data[par_start:]
# make current line as before par and tag as dialogue
print(" splitting and identifying before parenthentical line as ps15")
try:
print(before_par)
except:
pass
print("here")
df['data'][rev_index-j] = before_par
next_line_flag = df['nlb'][rev_index-j]
df['nlb'][rev_index-j] = 'N'
df['Identification_Status'][rev_index-j] = 'ps15'
df['parenthetical'][rev_index-j] = 'Absent'
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index-j] = 'Yes'
dialogue_end_identified = True
dia_end = rev_index-j
#make new next line as action line
print("action after dialogue, separating to newline ")
try:
print(after_par)
print("after par")
except:
pass
line_no = df['line_no'][rev_index-j]
print("2799",type(line_no))
# NOTE(review): the print inside this try reads new_line_no, which may not be
# assigned yet on this path; the resulting error is swallowed by the bare
# except, which then repeats the lookup with an int() label.
try:
next_line_no = df['line_no'][rev_index-j+1]
print("2802",type(new_line_no))
except:
next_line_no = df['line_no'][int(rev_index-j+1)]
print("2805",rev_index-j+1)
# Midpoint line number for the new row; float() fallback when the plain
# arithmetic raises.
try:
new_line_no = (line_no + next_line_no) / 2
except:
new_line_no = (float(line_no) + float(next_line_no)) / 2
try:
if new_line_no in audit_df.index:
new_line_no = (new_line_no + next_line_no)/2
except:
if new_line_no in audit_df.index:
new_line_no = (float(new_line_no) + float(next_line_no))/2
audit_df.loc[new_line_no] = np.nan
# NOTE(review): chained .loc[row][col] assignment - may not reach audit_df.
audit_df.loc[new_line_no]['line_removed'] = 'No'
audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
# add new action line , audit report, and change flag
df.loc[rev_index-j + 0.25] = np.nan
df.loc[rev_index-j + 0.25,'ssc'] = 0
df.loc[rev_index-j + 0.25,'data'] = after_par
df.loc[rev_index-j + 0.25,'case'] = ''
df.loc[rev_index-j + 0.25,'plb'] = 'N'
df.loc[rev_index-j + 0.25,'nlb'] = next_line_flag
#df.loc[index + 0.25,'Identification_Status'] = 'ps6'
df.loc[rev_index-j + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
df.loc[rev_index-j + 0.25,'isIdentified'] = 'Yes'
df.loc[rev_index-j + 0.25,'parenthetical'] = 'StartingLeft'
df.loc[rev_index-j + 0.25,'line_no'] = new_line_no
df = df.sort_index().reset_index(drop=True)
# One row was inserted before the last line of the run, so shift rev_index.
rev_index += 1
# insert audit report
# Rows from the opening bracket to the closing bracket become action lines:
# first one ps4 (prefixed with the speaker name), inner ones ps5, last one ps6.
if df['parenthetical'][rev_index-j] == 'StartingLeft' and (rev_index-j)>=start_index:
# j starts at 1 and is only incremented above, so this test holds here.
if j >=1:
df['Identification_Status'][rev_index-j] = 'ps4'
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index-j] = 'Yes'
cur_data = df['data'][rev_index-j]
new_data = speaker_name.capitalize() + ' ' + cur_data.replace('(','').strip()
try:
print(new_data)
except:
pass
df['data'][rev_index-j] = new_data
df['parenthetical'][rev_index-j] = 'Absent'
j -= 1
while j != 0:
df['Identification_Status'][rev_index-j] = 'ps5'
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index-j] = 'Yes'
try:
print(df['data'][rev_index-j])
except:
pass
j -= 1
df['Identification_Status'][rev_index] = 'ps6'
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index] = 'Yes'
cur_data = df['data'][rev_index]
new_data = cur_data.replace(')','').strip()
df['data'][rev_index] = new_data
df['parenthetical'][rev_index] = 'Absent'
try:
print(new_data)
except:
pass
# insert audit report
# Plain last line without parenthetical and not all upper case: dialogue end.
elif last_line_par == 'Absent' and df['case'][rev_index] != 'AllUpper':
print("Identifying as dialogue end")
df['Identification_Status'][rev_index] = 'ps15'
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][rev_index] = 'Yes'
dialogue_end_identified = True
### Now the last line or lines till parenthtical start have been examined
## if dialogue end is not identified then any last dialogue will be end
if not dialogue_end_identified:
print("Could not identify the dialogue")
continue
else:
print("dialogue end identfied as")
data = df['data'][dia_end]
try:
print(data)
except:
pass
## not for the remaining lines identify first as dialoguee beginning and others as middle
# Classify the first row of the run (only when it lies before the dialogue end).
j = start_index
cur_line_par = df['parenthetical'][j]
data = df['data'][j]
parenthetical_begun = False
if j < dia_end :
print("\n")
try:
print(data)
except:
pass
if cur_line_par == 'Absent':
print("Identifying as dialogue begining")
df['Identification_Status'][j] = 'ps13'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
# NOTE(review): ('PartMidEnd') is a plain string, not a tuple, so `in` is a
# substring test here (same for the other single-value `in (...)` tests below).
elif cur_line_par in ('PartMidEnd'):
print("Identifying as dialogue mixed with parenthetical")
df['Identification_Status'][j] = 'ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
elif cur_line_par == 'MixedLeft':
print("Identifying as dialogue mixed with parenthtical")
df['Identification_Status'][j] = 'ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = True
elif cur_line_par == 'StartingLeft':
print("Identifying as parenthetical beginning")
df['Identification_Status'][j] = 'ps11'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = True
elif cur_line_par in ('PartStartMid'):
print("Identifying as parenthetical mixed with dialog")
df['Identification_Status'][j] = 'ps26'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
elif cur_line_par in ('PartMidMid') :
print("Identifying as dialogue mixed with parenthtical ")
df['Identification_Status'][j] = 'ps26;ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
# Step to the next non-blank row.
next_line_blank = True if df['nlb'][j] == 'Y' else False
if next_line_blank :
j += 2
else:
j += 1
else:
continue
## now end and beginning have been examined . rest are middle if parentheical absent
cur_line_par = df['parenthetical'][j]
data = df['data'][j]
while j < dia_end :
print("\n")
try:
print(data)
except:
pass
if cur_line_par == 'Absent':
print("Identifying as dialogue middle")
df['Identification_Status'][j] = 'ps14'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
elif cur_line_par in ('PartMidEnd'):
print("Identifying as dialogue mixed with parenthetical")
df['Identification_Status'][j] = 'ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
elif cur_line_par == 'MixedLeft':
print("Identifying dialogue mixed with parenthtical")
df['Identification_Status'][j] = 'ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = True
elif cur_line_par == 'StartingLeft':
print("Identifying as parenthetical beginning")
df['Identification_Status'][j] = 'ps11'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = True
# NOTE(review): the exact value 'Absent' is already taken by the first branch
# of this chain, so this branch cannot be reached for it - confirm the
# intended order of the two 'Absent' tests.
elif cur_line_par in ('Absent') and parenthetical_begun:
print("Identifying as parenthetical middle")
df['Identification_Status'][j] = 'ps20'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = True
elif cur_line_par in ('EndingRight') and parenthetical_begun:
print("Identifying as parenthetical ending")
df['Identification_Status'][j] = 'ps12'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = False
elif cur_line_par in ('MixedRight') and parenthetical_begun:
print("Identifying as dialogue mixed with parenthetical ")
df['Identification_Status'][j] = 'ps26'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
parenthetical_begun = False
elif cur_line_par in ('PartStartMid'):
print("Identifying as parenthetical mixed with dialog")
df['Identification_Status'][j] = 'ps26'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
elif cur_line_par in ('PartMidMid') :
print("Identifying as dialogue mixed with parenthtical ")
df['Identification_Status'][j] = 'ps26;ps27'
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
df['isIdentified'][j] = 'Yes'
# Step to the next non-blank row and load its attributes for the next pass.
next_line_blank = True if df['nlb'][j] == 'Y' else False
if next_line_blank :
j += 2
else:
j += 1
data = df['data'][j]
print("printing_data before loop")
cur_line_par = df['parenthetical'][j]
# Final pass: a row counts as identified when it is 'blank' or has exactly one
# possibility left.
for index in df.index:
if df['Identification_Status'][index] == 'blank' or (len(df['Identification_Status'][index].split(";")) == 1):
df['isIdentified'][index] = 'Yes'
else:
df['isIdentified'][index] = 'No'
return df
def prep_pnnbl_wts(csv_pnbl_nnbl,cur_dir):
    """Split the combined weight matrix CSV into two per-direction CSV files.

    The input file is read with its first physical row skipped, so the second
    row supplies the header. Column 0 plus columns 28-39 are written to
    ``pnbl_weights.csv`` and column 0 plus columns 41-52 to
    ``nnbl_weights.csv``, both inside ``cur_dir`` and without the index.
    Columns are renamed purely by position: column 0 becomes
    ``'Possibilities'`` and the remaining twelve get the ``psN`` labels listed
    below (the two files use different label orders).

    Args:
        csv_pnbl_nnbl: Path (or file-like object) of the combined matrix CSV;
            it must contain at least 53 columns.
        cur_dir: Directory in which the two output CSV files are created.

    Returns:
        None. The result is the two files written to ``cur_dir``.

    Raises:
        IndexError: If the CSV has fewer than 53 columns (raised by ``iloc``).
    """
    pnbl_names = ['Possibilities',
                  'ps2', 'ps1', 'ps3', 'ps4', 'ps5', 'ps6',
                  'ps7', 'ps10', 'ps13', 'ps14', 'ps15', 'ps16']
    nnbl_names = ['Possibilities',
                  'ps3', 'ps2', 'ps1', 'ps16', 'ps13', 'ps14',
                  'ps15', 'ps10', 'ps7', 'ps4', 'ps5', 'ps6']

    pnbl_nnbl_df = pd.read_csv(csv_pnbl_nnbl, skiprows=[0])

    # Take explicit copies: renaming a frame obtained from an iloc selection
    # in place is the pattern pandas flags with SettingWithCopyWarning.
    pnbl_df = pnbl_nnbl_df.iloc[:, [0] + list(range(28, 40))].copy()
    nnbl_df = pnbl_nnbl_df.iloc[:, [0] + list(range(41, 53))].copy()

    # Positional relabelling; equivalent to the former 13-entry rename
    # mappings because read_csv de-duplicates header names.
    pnbl_df.columns = pnbl_names
    nnbl_df.columns = nnbl_names

    pnbl_df.to_csv(os.path.join(cur_dir, 'pnbl_weights.csv'), index=False)
    nnbl_df.to_csv(os.path.join(cur_dir, 'nnbl_weights.csv'), index=False)
    #return pnbl_df,nnbl_df
def identify_using_pnbl_nnbl(df,identify_using,iteration):
    """Run one PNBL/NNBL pass over ``df`` and narrow ambiguous lines.

    A line is ambiguous when its ``Identification_Status`` holds more than
    one ';'-separated possibility. For each such line the nearest previous
    and next rows whose status is not 'blank' are located. When such a
    neighbour carries exactly one possibility, the labels of the weights
    table whose value in that neighbour's column is > 0 become the allowed
    set for the line. The line's possibilities are intersected with both
    allowed sets, entries listed in ``ps_not_to_remove`` are put back, and
    the survivors are re-ordered by the line's per-possibility weight
    columns (highest weight first).

    NOTE(review): ``pnbl_df`` and ``nnbl_df`` are neither parameters nor
    locals of this function; they must resolve from an outer scope. If they
    do not, the bare ``except`` around their use swallows the NameError and
    the neighbour filter is silently skipped -- confirm where they are
    defined.
    NOTE(review): rows are addressed as ``index-j`` / ``index+j`` and bounded
    by ``0`` and ``len(df)``, so the index labels are presumably 0..n-1.
    NOTE(review): this copy of the file lost its indentation; the nesting
    below is reconstructed from the statement order.

    Args:
        df: per-line frame. Columns read here: ``identify_using``,
            'line_no', 'data', 'isIdentified', 'ps_not_to_remove' and one
            weight column per possibility label. Columns written:
            'Identification_Status', 'Identification_Status_with_weights',
            'pnbl_identified_as', 'pos_using_pnbl', 'nnbl_identified_as',
            'pos_using_nnbl', 'pos_using_pnbl_nnbl', 'When_Identified' and
            'CountofPossibilities_afterIteration<iteration>'.
        identify_using: name of the column copied into
            'Identification_Status' before the pass starts.
        iteration: only used to build the name of the count column.

    Returns:
        Tuple ``(df, new_lines_identified, identify_using,
        count_lines_identified, line_nos_identified, pos_decreased)``.
        ``identify_using`` is returned as 'Identification_Status' once any
        line has been narrowed to a single possibility, otherwise unchanged.
    """
    def takeNumeric(ps):
        # sort key: integer formed by the characters after the first two ('ps12' -> 12)
        return int(ps[2:])
    def useWeights(ps):
        # sort key: integer weight of a '<possibility>-<weight>' entry
        return int(ps.split("-")[1])
    count_lines_identified = 0
    # only referenced by the disabled branch mentioned inside the loop
    all_pos = [ "ps{0}".format(ps) for ps in range(1,35) ]
    new_lines_identified = False
    ## column to store the current identification status
    ## for the 1st iteration this is the stage-1 output (output after the strict conditions)
    df['Identification_Status'] = df[identify_using]
    pos_count_column_name = 'CountofPossibilities_afterIteration' + str(iteration)
    df[pos_count_column_name] = ''
    line_nos_identified = []
    total_pos_before = 0
    total_pos_after = 0
    pos_decreased = False
    for index in df.index:
        ## process the line only if it is unidentified (i.e. has more than one possibility)
        cur_line_pos = df['Identification_Status'][index].split(";")
        line_no = df['line_no'][index]
        # (disabled) rows with isIdentified == 'No' used to start from all_pos
        # for both neighbour sets instead of their current possibilities
        if len(cur_line_pos) > 1:
            print(line_no,": line currently has more than one possibilties")
            try:
                print(df['data'][index])
            except:
                pass
            print(cur_line_pos)
            # defaults used when a neighbour cannot be used for filtering
            line_pos_using_pnbl = cur_line_pos
            line_pos_using_nnbl = cur_line_pos
            total_pos_before += len(cur_line_pos)
        else:
            print(line_no, ": line already identified as",df['Identification_Status'][index])
            total_pos_before += 1
            total_pos_after += 1
            continue
        ## for an unidentified line find the previous line which is non-blank
        j=1
        pnbl = 'not found'
        print("looking for previous non-blank line")
        while (pnbl == 'not found') and ((index-j) >= 0 ):
            if (df['Identification_Status'][index-j] == 'blank'):
                print ("previous line is blank")
                print ("moving to subsequent previous line")
                j+=1
            else:
                pnbl = "found"
                print("found previous non-blank line")
                pnbl_line_pos = df['Identification_Status'][index-j].split(";")
        # check if the previous line found is identified or not (unidentified or having more than one possibility)
        if (pnbl == 'found'):
            if (df['isIdentified'][index-j] == 'No') :
                print("but as previous non-blank line is unidentified so cannot perform pnbl check, so skipping")
            elif len(pnbl_line_pos) > 1:
                print("but as previous non-blank line is unidentified (has more than one possibilties) so cannot perform pnbl check, so skipping")
            else:
                print("AND previous non-blank line is already identified as",df['Identification_Status'][index-j] )
                try:
                    print(df['data'][index-j])
                except:
                    pass
                pnbl_identified_as = df['Identification_Status'][index-j]
                df['pnbl_identified_as'][index] = pnbl_identified_as
                try:
                    # labels whose weight in the neighbour's column is positive
                    line_pos_using_pnbl = sorted(list(pnbl_df.loc[pnbl_df[pnbl_identified_as] > 0,pnbl_identified_as].index),key=takeNumeric)
                    line_pos_string = ';'.join([str(elem) for elem in line_pos_using_pnbl])
                    df['pos_using_pnbl'][index] = line_pos_string
                except:
                    # bare except: any failure (not only a missing column) ends up here
                    print("pnbl weights sheet does not have column",pnbl_identified_as)
                    pass
        ## for an unidentified line find the next line which is non-blank
        j=1
        nnbl = 'not found'
        print("looking for next non-blank line")
        while (nnbl == 'not found') and ((index+j) < (len(df))):
            if (df['Identification_Status'][index+j] == 'blank'):
                print ("next line is blank")
                print ("moving to subsequent next line")
                j+=1
            else:
                nnbl = "found"
                print("found next non-blank line")
                nnbl_line_pos = df['Identification_Status'][index+j].split(";")
        # check if the line found is identified or not
        if (nnbl == 'found'):
            if (df['isIdentified'][index+j] == 'No'):
                print("but as next non-blank line is unidentified so cannot perform nnbl check, so skipping")
            elif len(nnbl_line_pos)>1:
                print("but as next non-blank line is unidentified(has multiple possibilties) so cannot perform nnbl check, so skipping")
            else:
                print("AND next non-blank line is already identified as",df['Identification_Status'][index+j] )
                try:
                    print(df['data'][index+j])
                except:
                    pass
                nnbl_identified_as = df['Identification_Status'][index+j]
                df['nnbl_identified_as'][index] = nnbl_identified_as
                try:
                    line_pos_using_nnbl = sorted(list(nnbl_df.loc[nnbl_df[nnbl_identified_as] > 0,nnbl_identified_as].index),key=takeNumeric)
                    line_pos_string = ';'.join([str(elem) for elem in line_pos_using_nnbl])
                    df['pos_using_nnbl'][index] = line_pos_string
                except:
                    # bare except: any failure (not only a missing column) ends up here
                    print("nnbl weights sheet does not have column",nnbl_identified_as)
                    pass
        ## now get the combined possibilities, doing this for unidentified lines
        # (every row reaching this point has more than one possibility, see the `continue` above)
        if(df['isIdentified'][index] == 'No') or len(cur_line_pos)> 1:
            set_a = set(line_pos_using_pnbl)
            set_b = set(line_pos_using_nnbl)
            ## for lines having multiple possibilities some possibilities might get eliminated,
            ## so take the intersection with the current possibilities
            if len(cur_line_pos)> 1:
                set_c = set(cur_line_pos)
                pos_using_pnbl_nnbl = sorted(set.intersection(set_a,set_b,set_c))
            else:
                pos_using_pnbl_nnbl = sorted(set.intersection(set_a,set_b))
            ## put back (at the front) any possibility listed in 'ps_not_to_remove'
            ## that was eliminated by the intersection
            try:
                pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
            except:
                pos_not_to_remove = []
            for ps in pos_not_to_remove:
                if ps not in pos_using_pnbl_nnbl:
                    pos_using_pnbl_nnbl.insert(0,ps)
            print("pos_using_pnbl_nnbl is ",pos_using_pnbl_nnbl)
            ## if only one possibility is left the line counts as identified
            if len(pos_using_pnbl_nnbl) == 1:
                new_lines_identified = True
                identify_using = 'Identification_Status'
                count_lines_identified += 1
                line_nos_identified.append(df['line_no'][index])
                df['When_Identified'][index] = 'PNBL_NNBL'
            ## append the weight to the possibilities
            # NOTE(review): nesting inferred -- this section is taken to run for
            # every ambiguous line, not only for lines narrowed to one possibility.
            pos_with_weights = []
            for pos in pos_using_pnbl_nnbl:
                wt = ''
                pos_wt = str(pos)
                # a possibility whose weight cannot be read or converted is
                # skipped, so it is absent from the strings written below
                try:
                    wt = df[pos_wt][index]
                except:
                    print("could not find weight for pos ",pos," at index ",index)
                    continue
                try:
                    wt = int(wt)
                    pos_wt += '-' + str(wt)
                except:
                    print("could not convert wt to int for pos ",pos," at index ",index)
                    continue
                pos_with_weights.append(pos_wt)
            print("pos_with_weights is ",pos_with_weights)
            # now sort in descending order using the weights as key
            pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
            print("sorted pos_with_weights is ",pos_with_weights)
            line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
            df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
            ## copy over to identification status without the weights but in order of decreasing weights
            pos_without_weight = []
            for pos in pos_with_weights:
                pos_without_weight.append(pos.split("-")[0])
            line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
            print("line new possibilites",line_pos_string)
            df['pos_using_pnbl_nnbl'][index] = line_pos_string
            df['Identification_Status'][index] = line_pos_string
            print(df['Identification_Status'][index])
            ## write the count of possibilities to the per-iteration column
            df[pos_count_column_name][index] = len(pos_without_weight)
            total_pos_after += len(pos_without_weight)
    print( "new lines identified :" ,new_lines_identified)
    print(total_pos_before,total_pos_after)
    # the pass made progress if the total number of possibilities went down
    if (total_pos_before - total_pos_after) > 0:
        pos_decreased = True
    else:
        pos_decreased = False
    return df,new_lines_identified,identify_using,count_lines_identified,line_nos_identified,pos_decreased
def remove_ineligible_pos(df,identify_using,iteration):
    """Drop possibilities that are incompatible with the adjacent rows.

    For every row whose ``identify_using`` value holds more than one
    ';'-separated possibility, each possibility is checked against a rule
    table: it is kept when one of its allowed predecessors appears among the
    possibilities of row ``index-1`` (or the row is the first one) and one
    of its allowed successors appears among the possibilities of row
    ``index+1`` (or the row is the last one). Possibilities without a rule
    are kept unchanged. Entries from 'ps_not_to_remove' are put back, and
    the survivors are written to 'Identification_Status_ineligible_removed'
    ordered by decreasing weight. Rows with a single possibility are copied
    over as they are.

    NOTE(review): the neighbours are the immediately adjacent rows (blank
    rows are not skipped; 'blank' is itself an allowed neighbour value), and
    the first row is detected with ``index == 0``, so the index labels are
    presumably 0..n-1.
    NOTE(review): this copy of the file lost its indentation; the nesting
    below is reconstructed from the statement order.

    Args:
        df: per-line frame. Reads ``identify_using``, 'ps_not_to_remove' and
            the per-possibility weight columns; writes
            'Identification_Status_ineligible_removed',
            'CountofPossibilities_afterIneligibleRemoved',
            'Identification_Status_with_weights', 'When_Identified' and
            'CountofEligiblePossibilities_afterIteration<iteration>'.
        identify_using: column holding the current possibilities.
        iteration: only used to build the name of the count column.

    Returns:
        Tuple ``(df, new_lines_identified, pos_decreased,
        count_lines_identified, total_pos)``.
    """
    def useWeights(ps):
        # sort key: integer weight of a '<possibility>-<weight>' entry
        return int(ps.split("-")[1])
    def check_pos_eligibility(pos,pvs_line_pos,nxt_line_pos,first_line_flag,last_line_flag):
        # Looks `pos` up in ps_conditions_dict, which is read from the enclosing
        # function (rebuilt for every ambiguous row). The neighbour and flag
        # parameters are not used here. Returns the label, or False when the
        # rule rejects it; labels without a rule are returned unchanged.
        pos_eligible = False
        pos_eligible = ps_conditions_dict.get(pos,pos)
        return pos_eligible
    count_lines_identified = 0
    pos_count_column_name = 'CountofEligiblePossibilities_afterIteration' + str(iteration)
    df[pos_count_column_name] = ''
    new_lines_identified = False
    total_pos_before = 0
    total_pos = 0
    pos_decreased = False
    line_not_identified = False
    for index in df.index:
        total_pos_before += len(df[identify_using][index].split(";"))
        line_not_identified = True if (len(df[identify_using][index].split(";")) > 1) else False
        print (index,line_not_identified)
        if line_not_identified :
            line_pos = df[identify_using][index].split(";")
            pvs_line_pos = []
            nxt_line_pos = []
            first_line_flag = False
            last_line_flag = False
            # gather the possibilities of the adjacent rows; a missing
            # neighbour is replaced by the first/last-line flag
            if index == 0:
                first_line_flag = True
                nxt_line_pos = df[identify_using][index+1].split(";")
            elif index == df.index[-1]:
                pvs_line_pos = df[identify_using][index-1].split(";")
                last_line_flag = True
            else:
                pvs_line_pos = df[identify_using][index-1].split(";")
                nxt_line_pos = df[identify_using][index+1].split(";")
            line_eligible_pos = []
            print('\n')
            print (index)
            print(pvs_line_pos)
            print(nxt_line_pos)
            # rule table: label -> label if (allowed predecessor present or first row)
            # and (allowed successor present or last row), else False
            ps_conditions_dict = {
                'ps1': 'ps1' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16','ps17']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
                'ps2': 'ps2' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16','ps17']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps3']) or last_line_flag) else False,
                'ps3': 'ps3' if (any(ps in pvs_line_pos for ps in ['ps2']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
                'ps4': 'ps4' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps5','ps6']) or last_line_flag) else False,
                'ps5': 'ps5' if (any(ps in pvs_line_pos for ps in ['ps4','ps5']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps5','ps6']) or last_line_flag) else False,
                'ps6': 'ps6' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
                'ps7': 'ps7' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
                'ps8': 'ps8' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps9','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
                'ps9': 'ps9' if (any(ps in pvs_line_pos for ps in ['ps7']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps10','ps11','ps13','ps15']) or last_line_flag) else False,
                'ps10':'ps10' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps13','ps15']) or last_line_flag) else False,
                'ps11':'ps11' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps12','ps26']) or last_line_flag) else False,
                'ps12':'ps12' if (any(ps in pvs_line_pos for ps in ['ps11']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps13']) or last_line_flag) else False,
                'ps13':'ps13' if (any(ps in pvs_line_pos for ps in ['blank','ps7','ps8','ps9','ps10','ps12']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps14','ps15']) or last_line_flag) else False,
                'ps14':'ps14' if (any(ps in pvs_line_pos for ps in ['ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','blank']) or last_line_flag) else False,
                'ps15':'ps15' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
                'ps16':'ps16' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3']) or last_line_flag) else False,
                'ps17':'ps17' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps6','ps15','ps16','ps17','ps18']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps1','ps2','ps8','ps16','ps17','ps18','ps19','blank']) or last_line_flag) else False,
                'ps18':'ps18' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps5','ps16']) or last_line_flag) else False,
                'ps19':'ps19' if (any(ps in pvs_line_pos for ps in ['blank','ps15','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
                'ps21':'ps21' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps10','ps11','ps13']) or last_line_flag) else False,
                'ps22':'ps22' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3']) or last_line_flag) else False,
                'ps25':'ps25' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps12','ps13']) or last_line_flag) else False,
                'ps26':'ps26' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps11','blank','ps27']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','ps27']) or last_line_flag) else False,
                'ps27':'ps27' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','ps26','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps26','ps14','ps15']) or last_line_flag) else False,
                'ps28':'ps28' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps9','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
                'ps29':'ps29' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
                'ps30':'ps30' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps7']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','blank']) or last_line_flag) else False,
                'ps31':'ps31' if (any(ps in pvs_line_pos for ps in ['blank','ps15','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2']) or last_line_flag) else False
            }
            print("current possibilities",line_pos)
            for pos in line_pos:
                pos_checked = check_pos_eligibility(pos,pvs_line_pos,nxt_line_pos,first_line_flag,last_line_flag)
                if pos_checked:
                    line_eligible_pos.append(pos_checked)
            print("eligible possibilities",line_eligible_pos)
            ## put back (at the front) any possibility listed in 'ps_not_to_remove'
            ## that was eliminated by the rules
            try:
                pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
            except:
                pos_not_to_remove = []
            for ps in pos_not_to_remove:
                if ps not in line_eligible_pos:
                    line_eligible_pos.insert(0,ps)
            print (";".join(line_pos))
            print (";".join(line_eligible_pos))
            df['Identification_Status_ineligible_removed'][index] = ";".join(line_eligible_pos)
            df['CountofPossibilities_afterIneligibleRemoved'][index] = len(line_eligible_pos)
            ## write the count of possibilities to the per-iteration column
            df[pos_count_column_name][index] = len(line_eligible_pos)
            total_pos += len(line_eligible_pos)
            if len(line_eligible_pos) == 1:
                count_lines_identified +=1
                new_lines_identified = True
                df['When_Identified'][index] = 'RemovingIneligiblePossibilities'
        else:
            # NOTE(review): pairing inferred -- this `else` is taken to belong to
            # `if line_not_identified`: single-possibility rows are copied as-is.
            df['Identification_Status_ineligible_removed'][index] = df[identify_using][index]
            total_pos += 1
            continue
        ## re-read the surviving possibilities and sort them in decreasing order of weight
        pos_eligible = df['Identification_Status_ineligible_removed'][index].split(";")
        ## append the weight to the possibilities
        print("test")
        pos_with_weights = []
        for pos in pos_eligible:
            wt = ''
            pos_wt = str(pos)
            # a possibility whose weight cannot be read or converted is skipped,
            # so it is absent from the strings written below
            try:
                wt = df[pos_wt][index]
            except:
                print("could not find weight for pos ",pos," at index ",index)
                continue
            try:
                wt = int(wt)
                pos_wt += '-' + str(wt)
            except:
                print("could not convert wt to int for pos ",pos," at index ",index)
                continue
            pos_with_weights.append(pos_wt)
        # now sort in descending order using the weights as key
        pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
        print(pos_with_weights)
        line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
        df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
        ## copy over to identification status without the weights but in order of decreasing weights
        pos_without_weight = []
        for pos in pos_with_weights:
            pos_without_weight.append(pos.split("-")[0])
        line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
        print(line_pos_string)
        df['Identification_Status_ineligible_removed'][index] = line_pos_string
    # the pass made progress if the total number of possibilities went down
    if (total_pos_before - total_pos) > 0:
        pos_decreased = True
    else:
        pos_decreased = False
    print(total_pos_before,total_pos)
    return df,new_lines_identified,pos_decreased,count_lines_identified,total_pos
def do_while_pnnbl_ineligible(df):
    """Alternate the PNBL/NNBL pass and the ineligible-removal pass until stable.

    Each outer round first repeats ``identify_using_pnbl_nnbl`` while it
    keeps identifying lines or reducing the number of possibilities, then
    repeats ``remove_ineligible_pos`` under the same condition, and finally
    copies 'Identification_Status_ineligible_removed' back into
    'Identification_Status'. Another round is run only when the removal
    phase reduced the total number of possibilities.

    NOTE(review): indentation was lost in the copy this was restored from.
    Two placements are inferred: the per-iteration ``value_counts`` print in
    phase 1, and the initialisation of
    'CountofPossibilities_afterIneligibleRemoved' together with
    'Identification_Status_ineligible_removed'.

    Args:
        df: per-line frame; it must already contain 'Identification_Status'
            holding ';'-separated possibilities.

    Returns:
        The updated frame.
    """
    def _numeric_where_possible(column):
        # Same result as the deprecated pd.to_numeric(errors='ignore'):
        # a column that cannot be parsed is returned unchanged.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def _total_possibilities(column_name):
        # Number of ';'-separated possibilities summed over all rows.
        return sum(len(status.split(";")) for status in df[column_name])

    ## import the pnbl and nnbl weights
    cur_dir = mypath
    # NOTE(review): these two frames are not passed to
    # identify_using_pnbl_nnbl, which refers to names `pnbl_df` / `nnbl_df`
    # of its own scope chain -- confirm how it is meant to receive them.
    pnbl_df = pd.read_csv(os.path.join(cur_dir, 'pnbl_weights.csv'),
                          index_col='Possibilities', keep_default_na=False)
    pnbl_df = pnbl_df.head(34).apply(_numeric_where_possible)
    nnbl_df = pd.read_csv(os.path.join(cur_dir, 'nnbl_weights.csv'),
                          index_col='Possibilities', keep_default_na=False)
    nnbl_df = nnbl_df.head(34).apply(_numeric_where_possible)

    # Bookkeeping columns the PNBL/NNBL pass writes into.
    helper_columns = (
        'Identification_Status_with_weights',
        'pnbl_identified_as',
        'pos_using_pnbl',
        'nnbl_identified_as',
        'pos_using_nnbl',
        'pos_using_pnbl_nnbl',
    )

    total_pos_initial = _total_possibilities('Identification_Status')
    total_pos_after = 0
    run_again = True
    while run_again:
        ## phase 1: identify using pnbl_nnbl till no new lines get identified
        # 'Identification_Status' is known to exist (it was read above), so
        # the former 'stage-1_output' fallback could never be taken.
        identify_using = 'Identification_Status'
        for column in helper_columns:
            if column not in df:
                df[column] = ''

        new_lines_identified = True
        pos_decreased = False
        iteration = 1
        count_total = 0
        line_nos_identified = []
        while new_lines_identified or pos_decreased:
            print("Identifying lines using pnbl_nnbl ")
            print("using:", identify_using)
            (df, new_lines_identified, identify_using, count,
             line_nos_identified_iteration, pos_decreased) = identify_using_pnbl_nnbl(
                df, identify_using, iteration)
            print("New lines identified in Iteration", iteration, ": ", count)
            iteration += 1
            count_total += count
            line_nos_identified.append(line_nos_identified_iteration)
            print("lines identified in iteration", line_nos_identified)
            print(df['Identification_Status'].value_counts())
        print("Total new lines identified in pnbl nnbl after all iteration:", count_total)
        print("line nos identified in all iterations", line_nos_identified)

        ## phase 2: eliminate ineligible possibilities till nothing changes
        new_lines_identified = True
        pos_decreased = True
        iteration = 1
        count_total = 0
        total_pos_after = 0
        identify_using = 'Identification_Status'
        if 'Identification_Status_ineligible_removed' not in df.columns:
            df['Identification_Status_ineligible_removed'] = ''
            df['CountofPossibilities_afterIneligibleRemoved'] = ''
        total_pos_start = _total_possibilities('Identification_Status')
        while new_lines_identified or pos_decreased:
            print("\n Identifying lines using eliminating ineligible possibilities ")
            print("using:", identify_using)
            (df, new_lines_identified, pos_decreased, count,
             total_pos_after) = remove_ineligible_pos(df, identify_using, iteration)
            print("New lines identified in Iteration", iteration, ": ", count)
            # later iterations refine the output of the previous one
            identify_using = 'Identification_Status_ineligible_removed'
            iteration += 1
            count_total += count
        print("Total new lines identified by eliminating ineligible possibilities after all iteration:", count_total)

        ## copy over column
        df['Identification_Status'] = df['Identification_Status_ineligible_removed']
        print(df['Identification_Status'].value_counts())
        print(total_pos_start, total_pos_after, iteration)
        ## run both phases again only if the removal phase reduced the possibilities
        run_again = total_pos_start > total_pos_after
    print(total_pos_initial, total_pos_after)
    return df
def examine_same_content_lines(df):
    """Resolve unidentified lines whose text repeats elsewhere in the script.

    The stripped text of every line with ``isIdentified == 'No'`` is
    counted. For each text that occurs more than once, every unidentified
    line with that text which still has 'ps7' among its possibilities, and
    whose previous non-blank line is identified as 'ps15' or 'ps6', is
    updated:

    * 'parenthetical' == 'Absent': status becomes 'ps7', the line is marked
      identified and 'When_Identified' is set to
      'ExaminingSameContentLines';
    * otherwise: status becomes 'ps8;ps25' (the line stays unidentified).

    The previous non-blank line is ``index - 2`` when 'plb' is 'Y' and
    ``index - 1`` otherwise, so index labels are expected to be 0..n-1.

    Fixes over the previous version: the neighbour test compared against
    ``('ps15' or 'ps6')`` (i.e. only 'ps15'), both status updates used
    ``==`` instead of ``=`` and therefore did nothing, and the helper column
    was added to a slice of ``df``.

    Args:
        df: per-line frame with columns 'data', 'isIdentified',
            'Identification_Status', 'plb', 'parenthetical' and
            'When_Identified'. It is updated in place.

    Returns:
        The same frame.
    """
    # Explicit copy: the helper column must not be written onto a slice of df.
    df_udn = df.loc[df['isIdentified'] == 'No', :].copy()
    df_udn['data_strip'] = df_udn['data'].str.strip()
    df_occurences = df_udn.value_counts(['data_strip']).reset_index(name='count')
    pos_sp_list = df_occurences.loc[df_occurences['count'] > 1, 'data_strip'].to_list()

    for pos_sp in pos_sp_list:
        print(pos_sp)
        for index in df.index:
            if df.at[index, 'isIdentified'] == 'Yes':
                continue
            if 'ps7' not in df.at[index, 'Identification_Status'].split(";"):
                continue

            ## preceded by
            pnbl = index - 2 if df.at[index, 'plb'] == 'Y' else index - 1
            if pnbl < 0:
                print("start of script \n")
                continue

            if df.at[index, 'data'].strip() != pos_sp:
                continue
            if df.at[pnbl, 'Identification_Status'] not in ('ps15', 'ps6'):
                continue

            print(index)
            try:
                print(df.at[pnbl, 'data'])
            except Exception:
                # best-effort debug output only (e.g. console encoding errors)
                pass
            print(df.at[pnbl, 'Identification_Status'])

            if df.at[index, 'parenthetical'] == 'Absent':
                df.at[index, 'Identification_Status'] = 'ps7'
                df.at[index, 'isIdentified'] = 'Yes'
                df.at[index, 'When_Identified'] = 'ExaminingSameContentLines'
            else:
                df.at[index, 'Identification_Status'] = 'ps8;ps25'
    return df
def examine_action_possibilities_part1(df):
    """Mark an unidentified line as 'ps6' when it closes a 'ps4'/'ps5' run.

    Rows ``df.index[2:-2]`` with ``isIdentified != 'Yes'`` are examined.
    The previous/next non-blank neighbour is taken one row away when
    'plb'/'nlb' is 'N' and two rows away otherwise. A line whose 'ssc'
    value is between 15 and 25 is set to 'ps6' (with 'When_Identified' =
    'ExaminingActionPossibilities') when the previous neighbour has exactly
    one possibility that is 'ps4' or 'ps5', has the same 'ssc' value, and
    the first possibility of the next neighbour is 'ps7'.
    'isIdentified' is not changed here.

    NOTE(review): this copy of the file lost its indentation. The ps6 check
    is placed inside the 15..25 'ssc' guard because it prints ``data``, which
    is only assigned inside that guard -- confirm against the original.
    NOTE(review): neighbours are addressed as ``index +/- 1`` or ``+/- 2``,
    so index labels are presumably consecutive integers.

    Args:
        df: per-line frame with columns 'isIdentified', 'ssc', 'nlb', 'plb',
            'Identification_Status', 'data' and 'When_Identified'.

    Returns:
        The same frame.
    """
    # loop through the lines and check each line's possibility of being action
    for index in df.index[2:-2]:
        if df['isIdentified'][index] == 'Yes':
            continue
        print("unidentified line index is",index)
        cur_line_indent = df['ssc'][index]
        next_line_blank = df['nlb'][index]
        prev_line_blank = df['plb'][index]
        # next non-blank neighbour: skip over one blank row if flagged
        if next_line_blank == 'N':
            next_nbl_line_indent = df['ssc'][index+1]
            next_nbl_line_pos = df['Identification_Status'][index+1].split(";")
        else:
            next_nbl_line_indent = df['ssc'][index+2]
            next_nbl_line_pos = df['Identification_Status'][index+2].split(";")
        # previous non-blank neighbour: skip over one blank row if flagged
        if prev_line_blank == 'N':
            prev_nbl_line_indent = df['ssc'][index-1]
            prev_nbl_line_pos = df['Identification_Status'][index-1].split(";")
        else:
            prev_nbl_line_indent = df['ssc'][index-2]
            prev_nbl_line_pos = df['Identification_Status'][index-2].split(";")
        ## check for ps5,ps4
        if cur_line_indent >=15 and cur_line_indent <=25:
            data = df['data'][index]
            # (disabled) an earlier heuristic lived here: when the line and both
            # neighbours shared the same indent it rewrote the status to
            # 'ps1;ps5;ps4', 'ps6;ps5;ps4' or 'ps5;ps4' depending on whether
            # ps4/ps5/ps16/ps17 led the previous neighbour and ps5/ps6 were
            # among the next neighbour's possibilities.
            ## check for ps6
            if len(prev_nbl_line_pos) == 1 and cur_line_indent == prev_nbl_line_indent and next_nbl_line_pos[0] == 'ps7':
                ps4_in_prev = True if 'ps4' == prev_nbl_line_pos[0] else False
                ps5_in_prev = True if 'ps5' == prev_nbl_line_pos[0] else False
                if any([ps4_in_prev,ps5_in_prev]):
                    cur_line_new_pos = 'ps6'
                    df['Identification_Status'][index] = cur_line_new_pos
                    df['When_Identified'][index] = 'ExaminingActionPossibilities'
                    try:
                        print(data)
                    except:
                        pass
                    print(cur_line_new_pos)
                    print("\n")
                    continue
    return df
def examine_action_possibilities_part2(df):
    """Identify 'ps6' and 'ps5' lines from their identified neighbours.

    Two passes are made over rows ``df.index[2:-2]``. The previous/next
    non-blank neighbour of a row is one row away when 'plb'/'nlb' is 'N' and
    two rows away otherwise, so index labels are expected to be consecutive
    integers.

    Pass 1 (rows with ``isIdentified != 'Yes'`` whose first possibility is
    not 'ps1'): the line becomes 'ps6' when the next neighbour is identified
    as 'ps7' and the previous neighbour is identified as 'ps1', 'ps3', 'ps6'
    or 'ps15'.

    Pass 2 (rows that still have more than one possibility): the line
    becomes 'ps5' when the previous neighbour is identified as 'ps4' or
    'ps5' and the next neighbour is identified as 'ps5' or 'ps6'.

    In both cases 'isIdentified' is set to 'Yes' and 'When_Identified' to
    'ExaminingActionPossibilitiesAfterIneligible'.

    Fix over the previous version: pass 2 tested the *list* of neighbour
    possibilities against a tuple of strings, which is never true, so the
    ps5 rule could not fire. Writes also no longer use chained assignment.

    Args:
        df: per-line frame with columns 'isIdentified',
            'Identification_Status', 'plb', 'nlb', 'line_no', 'data' and
            'When_Identified'. It is updated in place.

    Returns:
        The same frame.
    """
    def _neighbour_possibilities(index):
        # Possibility lists of the previous and next non-blank neighbours.
        prev_index = index - 1 if df.at[index, 'plb'] == 'N' else index - 2
        next_index = index + 1 if df.at[index, 'nlb'] == 'N' else index + 2
        return (df.at[prev_index, 'Identification_Status'].split(";"),
                df.at[next_index, 'Identification_Status'].split(";"))

    def _mark_identified(index, label):
        df.at[index, 'Identification_Status'] = label
        df.at[index, 'When_Identified'] = 'ExaminingActionPossibilitiesAfterIneligible'
        df.at[index, 'isIdentified'] = 'Yes'

    inner_rows = df.index[2:-2]

    # pass 1: declare ps6 if nnbl is ps7 and pnbl is ps1, ps3, ps6 or ps15
    for index in inner_rows:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        if cur_line_pos[0] == 'ps1':
            continue
        pnbl_pos, nnbl_pos = _neighbour_possibilities(index)
        if len(pnbl_pos) != 1 or len(nnbl_pos) != 1 or nnbl_pos[0] != 'ps7':
            continue
        if pnbl_pos[0] not in ('ps1', 'ps3', 'ps6', 'ps15'):
            continue
        print("Identifying line as ps6 as before speaker and after 1,3,6, 15",)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        try:
            print(line_no, data)
        except Exception:
            # best-effort debug output only (e.g. console encoding errors)
            pass
        _mark_identified(index, 'ps6')

    # pass 2: declare ps5 if prev is ps4/ps5 and next is ps5/ps6
    for index in inner_rows:
        status = df.at[index, 'Identification_Status']
        if status == 'blank':
            continue
        if len(status.split(";")) == 1:
            continue
        pnbl_pos, nnbl_pos = _neighbour_possibilities(index)
        prev_is_action = len(pnbl_pos) == 1 and pnbl_pos[0] in ('ps4', 'ps5')
        next_is_action = len(nnbl_pos) == 1 and nnbl_pos[0] in ('ps5', 'ps6')
        if prev_is_action and next_is_action:
            print("Identifying line as ps5 as between 4,5 and 5,6")
            _mark_identified(index, 'ps5')
    return df
def examine_same_indent_bunch(df):
    """Resolve runs of same-indent ambiguous lines sitting between 'ps15' and 'ps7'.

    Starting from each line that still has more than one possibility, the
    following non-blank lines are collected into a "bunch" while their 'ssc'
    value equals the current one. Neighbours are reached by stepping one row,
    or two rows when 'nlb'/'plb' is 'Y'. When the line before the bunch is
    identified as 'ps15' and the line after it as 'ps7':

    * a single-line bunch (first possibility not 'ps1') is set to 'ps6';
    * for a longer bunch whose last line is ambiguous and does not list
      'ps15', the last line is set to 'ps6' and the remaining lines keep
      only the possibilities in
      ps1, ps2, ps3, ps4, ps5, ps6, ps16, ps18, ps19.

    After a bunch has been handled the outer iterator is advanced past it.
    The totals of possibilities before and after are only printed.

    NOTE(review): this copy of the file lost its indentation. The clean-up
    of the "rest of lines" is placed inside the CASE A1 branch; it would
    behave differently if it originally followed the if/elif -- confirm.
    NOTE(review): rows are addressed with ``index + 1`` / ``index + 2`` and
    compared with ``df.index[-1]`` and ``0``, so index labels are presumably
    0..n-1.

    Args:
        df: per-line frame with columns 'Identification_Status', 'ssc',
            'nlb', 'plb', 'data', 'isIdentified' and 'When_Identified'.

    Returns:
        The same frame.
    """
    total_pos_before = 0
    total_pos_after = 0
    for index in df.index:
        line_pos = df['Identification_Status'][index].split(";")
        total_pos_before += len(line_pos)
    # explicit iterator so that a processed bunch can be skipped with next()
    index_iter = iter(df.index)
    for index in index_iter:
        line_pos = df['Identification_Status'][index].split(";")
        if len(line_pos) == 1:
            continue
        print(index)
        cur_indent = df['ssc'][index]
        next_line_blank = True if df['nlb'][index] == 'Y' else False
        # not enough rows left to look two rows ahead: stop the scan
        if index+2 > df.index[-1]:
            break
        if next_line_blank:
            next_nbl_indent = df['ssc'][index+2]
            nbl_identified = True if len(df['Identification_Status'][index+2].split(";")) == 1 else False
            j = index + 2
            lines_count = 2
        else:
            next_nbl_indent = df['ssc'][index+1]
            nbl_identified = True if len(df['Identification_Status'][index+1].split(";")) == 1 else False
            j = index + 1
            lines_count = 1
        start_index = index
        rev_index = index
        nbl_lines_count = 1
        data = df['data'][index]
        print("lines with same indent")
        try:
            print(data)
        except:
            pass
        bunch_index = []
        bunch_index.append(start_index)
        ## collect the following lines that have the same indent
        # nbl_identified is evaluated once above and not refreshed inside the loop
        while cur_indent == next_nbl_indent and not nbl_identified:
            nbl_lines_count +=1
            cur_indent = next_nbl_indent
            next_line_blank = True if df['nlb'][j] == 'Y' else False
            data = df['data'][j]
            try:
                print(data)
            except:
                pass
            bunch_index.append(j)
            rev_index = j
            if j+2 >= df.index[-1]:
                break
            if next_line_blank:
                next_nbl_indent = df['ssc'][j+2]
                j += 2
            else:
                next_nbl_indent = df['ssc'][j+1]
                j += 1
        print(nbl_lines_count)
        ## preceded by
        prev_line_blank = True if df['plb'][index] == 'Y' else False
        if prev_line_blank:
            if start_index - 2 >= 0:
                pnbl = start_index - 2
            else:
                print("start of script \n")
                continue
        else:
            if start_index - 1 >= 0:
                pnbl = start_index - 1
            else:
                print("start of script \n")
                continue
        print("preceeded by",df['Identification_Status'][pnbl])
        try:
            print(df['data'][pnbl])
        except:
            pass
        ## followed by
        next_line_blank = True if df['nlb'][rev_index] == 'Y' else False
        if next_line_blank:
            if rev_index + 2 <= df.index[-1]:
                nnbl = rev_index + 2
            else:
                print("end of script \n")
                continue
        else:
            if rev_index + 1 <= df.index[-1]:
                nnbl = rev_index + 1
            else:
                print("end of script \n")
                continue
        try:
            print(df['data'][nnbl])
        except:
            pass
        print("followed by",df['Identification_Status'][nnbl])
        print("\n")
        if df['Identification_Status'][pnbl] == 'ps15' and df['Identification_Status'][nnbl] == 'ps7':
            last_line_pos = df['Identification_Status'][rev_index].split(";")
            if nbl_lines_count == 1 and len(df['Identification_Status'][rev_index].split(";")) > 1:
                if line_pos[0] == 'ps1':
                    continue
                print("CASE A2")
                # single line is ps6
                df['Identification_Status'][rev_index] = 'ps6'
                df['isIdentified'][rev_index] = 'Yes'
                df['When_Identified'][rev_index] = 'ExaminingSameIndentBunch'
                bunch_index.remove(rev_index)
                print("ps6", df['data'][rev_index])
            elif nbl_lines_count > 1 and len(last_line_pos) > 1 and 'ps15' not in last_line_pos:
                print("CASE A1")
                # last line is ps6
                df['Identification_Status'][rev_index] = 'ps6'
                df['isIdentified'][rev_index] = 'Yes'
                df['When_Identified'][rev_index] = 'ExaminingSameIndentBunch'
                bunch_index.remove(rev_index)
                print("ps6", df['data'][rev_index])
                # rest of lines: remove possibilities other than slugline, transition and action
                bunch_iter = iter(bunch_index)
                ps_not_to_remove = ['ps1','ps2','ps3','ps4','ps5','ps6','ps16','ps18','ps19']
                for k in bunch_iter:
                    cur_line_pos = df['Identification_Status'][k].split(";")
                    print(cur_line_pos)
                    new_line_pos = []
                    for pos in cur_line_pos:
                        if pos in ps_not_to_remove:
                            new_line_pos.append(pos)
                    df['Identification_Status'][k] = ";".join(new_line_pos)
                    df['When_Identified'][k] = 'ExaminingSameIndentBunch'
                    print(df['Identification_Status'][k], df['data'][k])
        # (disabled) two earlier variants lived here: "CASE A" forced the first
        # line of a bunch to 'ps4', the middle lines to 'ps5' and the last to
        # 'ps6'; "CASE B" applied the same possibility filter as above when the
        # bunch was not preceded by 'ps15' and was followed by 'ps4' or 'ps6'.
        # skip the rows already covered by this bunch
        # (not reached when one of the `continue` statements above fired)
        for advance in range(start_index,rev_index):
            next(index_iter)
    for index in df.index:
        line_pos = df['Identification_Status'][index].split(";")
        total_pos_after += len(line_pos)
    print(total_pos_before,total_pos_after)
    return df
def examine_relative_indent(df):
    """Mark a line as 'ps7' when the indent steps down over the next two lines.

    Only lines whose ``Identification_Status`` (a ``;``-separated list of
    candidate codes) still holds more than one candidate are examined.  For
    such a line the following line (``nnbl``) and the one after it
    (``nnnbl``) are looked up; whenever ``nlb`` is ``'Y'`` the lookup steps
    two rows instead of one.  The line is set to ``'ps7'`` when

    * ``ssc`` strictly decreases from the current line to ``nnbl`` to
      ``nnnbl``,
    * all three lines have ``parenthetical == 'Absent'``, and
    * ``'ps7'`` is a candidate of the current line but not of ``nnbl``.

    Lines that lack a preceding line (one or two rows back depending on
    ``plb``) or the two following lines are skipped.  Row labels are used in
    arithmetic (``index - 2``, ``index + 1``), so integer labels are expected.

    Args:
        df: DataFrame with the columns ``Identification_Status``, ``data``,
            ``ssc``, ``parenthetical``, ``plb``, ``nlb``, ``When_Identified``
            and ``isIdentified``.  It is modified in place.

    Returns:
        The same DataFrame object.
    """

    def _count_possibilities():
        # Total number of candidate codes over all rows (debug metric only).
        return sum(
            len(df['Identification_Status'][i].split(";")) for i in df.index
        )

    total_pos_before = _count_possibilities()

    for index in df.index:
        line_pos = df['Identification_Status'][index].split(";")
        if len(line_pos) == 1:
            continue

        data = df['data'][index]
        cur_indent = df['ssc'][index]
        cur_parenthetical_absent = df['parenthetical'][index] == 'Absent'

        # A preceding line must exist, even though only the following lines
        # take part in the decision below.
        prev_step = 2 if df['plb'][index] == 'Y' else 1
        if index - prev_step < 0:
            continue

        last_index = df.index[-1]

        # Following line.
        next_step = 2 if df['nlb'][index] == 'Y' else 1
        nnbl = index + next_step
        if nnbl > last_index:
            if next_step == 2:
                print("end of script \n")
            continue
        nnbl_indent = df['ssc'][nnbl]
        nnbl_parenthetical_absent = df['parenthetical'][nnbl] == 'Absent'
        nnbl_line_pos = df['Identification_Status'][nnbl].split(";")

        # Line following the following line.
        next_next_step = 2 if df['nlb'][nnbl] == 'Y' else 1
        nnnbl = nnbl + next_next_step
        if nnnbl > last_index:
            if next_next_step == 2:
                print("end of script \n")
            continue
        nnnbl_indent = df['ssc'][nnnbl]
        nnnbl_parenthetical_absent = df['parenthetical'][nnnbl] == 'Absent'

        try:
            indent_steps_down = bool(
                cur_indent > nnbl_indent and nnbl_indent > nnnbl_indent
            )
        except (TypeError, ValueError):
            # Indent values that cannot be ordered: leave the line untouched.
            continue

        if not (indent_steps_down and cur_parenthetical_absent
                and nnbl_parenthetical_absent and nnnbl_parenthetical_absent):
            continue

        try:
            print(data)
        except (UnicodeError, OSError):
            # Debug output must not abort the audit on an unencodable line.
            pass
        print("current possibility", line_pos)
        if 'ps7' in line_pos and 'ps7' not in nnbl_line_pos:
            print("Identifying as ps7")
            # .at writes into df itself; df[col][index] = ... may write to a
            # temporary copy and is ignored under pandas Copy-on-Write.
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'When_Identified'] = 'ExaminingRelativeIndent'
            df.at[index, 'isIdentified'] = 'Yes'
        print("\n")

    total_pos_after = _count_possibilities()
    print(total_pos_before, total_pos_after)
    return df
def examine_pos_sp_indent(df,csv_removed_space_between_words,csv_pnnbl_ineligble_after_relative_indent):
    """Identify short lines at the usual speaker indent as speaker ('ps7').

    The two CSV inputs are joined on ``line_no`` to find the ``ssc`` values of
    lines whose ``Identification_Status`` is exactly ``'ps7'``.  Of the five
    most frequent such values the largest is taken as the expected speaker
    indent; when there is none, 200 is used.

    Every row of ``df`` that is not yet identified, whose ``ssc`` lies within
    3 of that indent, whose ``Identification_Status`` string contains
    ``'ps7'``, whose ``parenthetical`` is ``'Absent'`` and whose ``data`` has
    at most two whitespace-separated words is set to ``'ps7'``.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``ssc``, ``data``,
            ``Identification_Status``, ``parenthetical`` and
            ``When_Identified``.  It is modified in place.
        csv_removed_space_between_words: CSV path or buffer providing the
            columns ``line_no`` and ``ssc``.
        csv_pnnbl_ineligble_after_relative_indent: CSV path or buffer
            providing ``line_no``, ``Identification_Status`` and
            ``isIdentified``.

    Returns:
        The same DataFrame object.
    """
    df_indents = pd.read_csv(
        csv_removed_space_between_words, usecols=['line_no', 'ssc'])
    identification_status = pd.read_csv(
        csv_pnnbl_ineligble_after_relative_indent,
        usecols=['line_no', 'Identification_Status', 'isIdentified'])
    identification_status['line_no'] = (
        identification_status['line_no'].astype(float))
    df_indents = df_indents.merge(
        identification_status, how='inner', on='line_no')

    # Five most frequent indents among lines already known to be 'ps7'.
    is_speaker = df_indents['Identification_Status'] == 'ps7'
    sp_indents_df = (
        df_indents.loc[is_speaker, 'ssc']
        .value_counts()
        .sort_values(ascending=False)
        .head(5)
    )
    # 200 is the fallback used when no 'ps7' line is available.
    pos_sp_indent = max(sp_indents_df.index.values.tolist(), default=200)
    margin = 3

    for index in df.index:
        if df['isIdentified'][index] == 'Yes':
            continue
        cur_indent = df['ssc'][index]
        if not (pos_sp_indent - margin <= cur_indent <= pos_sp_indent + margin):
            continue
        data = df['data'][index]
        word_count = len(data.split())
        if ('ps7' in df['Identification_Status'][index]
                and df['parenthetical'][index] == 'Absent'
                and word_count <= 2):
            try:
                print(index, data)
            except (UnicodeError, OSError):
                # Debug output must not abort the audit.
                pass
            print("Identifying as speaker")
            # .at writes into df itself; chained df[col][index] = ... may
            # write to a temporary copy.
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'isIdentified'] = 'Yes'
            df.at[index, 'When_Identified'] = 'ExaminingPossibleSpeakerIndent'
    return df
def examine_action_middle_possibilities_using_pnnbl_top(df):
    """Walk the unidentified lines and gather their neighbours' candidates.

    For every row except the first two and last two, the candidate lists of
    the preceding line, the following line and the line before the preceding
    one are read (stepping two rows instead of one where ``plb`` / ``nlb`` is
    not ``'N'``).  The rule that used these values to declare a line 'ps5' is
    disabled, so the frame is returned without any modification.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame object, unchanged.
    """
    for row in df.index[2:-2]:
        if df['isIdentified'][row] == 'Yes':
            continue

        # preceding line
        back = 1 if df['plb'][row] == 'N' else 2
        before_candidates = df['Identification_Status'][row - back].split(";")
        before_row = row - back

        # following line
        ahead = 1 if df['nlb'][row] == 'N' else 2
        after_candidates = df['Identification_Status'][row + ahead].split(";")

        # line before the preceding line; may not exist near the top
        try:
            back_again = 1 if df['plb'][before_row] == 'N' else 2
            before_before_candidates = (
                df['Identification_Status'][before_row - back_again].split(";")
            )
        except:
            pass

        number = df['line_no'][row]
        text = df['data'][row]
        own_candidates = df['Identification_Status'][row].split(";")
        # The ps5 rule (previous ps4/ps5 and next ps5/ps6) that consumed the
        # values above is currently switched off.
    return df
def _split_speaker_row(df, audit_df, index, speaker_text, remainder_text,
                       remainder_status, remainder_parenthetical, audit_note):
    """Make row ``index`` a speaker line and insert the remainder below it.

    Row ``index`` gets ``speaker_text`` as data and is marked 'ps7'.  A new
    row holding ``remainder_text`` is added under the label ``index + 0.25``
    and the frame is then sorted and renumbered, so the new row ends up
    directly after ``index``.  The new row's ``line_no`` is the midpoint of
    the current and next ``line_no`` (moved once more towards the next line
    if that value is already a label of ``audit_df``); a row with that label
    is also added to ``audit_df``.

    Returns the re-indexed DataFrame (a new object).
    """
    nlb = df['nlb'][index]
    # .at / .loc[row, col] write into the frame itself; chained forms such as
    # df[col][index] = ... or audit_df.loc[row][col] = ... may write to a
    # temporary copy and are ignored under pandas Copy-on-Write.
    df.at[index, 'data'] = speaker_text
    df.at[index, 'parenthetical'] = 'Absent'
    df.at[index, 'When_Identified'] = 'ExaminingSpeakerLines'
    df.at[index, 'case'] = 'AllUpper'
    df.at[index, 'Identification_Status'] = 'ps7'
    df.at[index, 'nlb'] = 'N'

    line_no = df['line_no'][index]
    next_line_no = df['line_no'][index + 1]
    new_line_no = (line_no + next_line_no) / 2
    if new_line_no in audit_df.index:
        new_line_no = (new_line_no + next_line_no) / 2
    audit_df.loc[new_line_no] = ''
    audit_df.loc[new_line_no, 'line_removed'] = 'No'
    audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = audit_note

    new_label = index + 0.25
    df.loc[new_label] = ''
    df.loc[new_label, 'data'] = remainder_text
    if remainder_parenthetical is not None:
        df.loc[new_label, 'parenthetical'] = remainder_parenthetical
    df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerLines'
    df.loc[new_label, 'Identification_Status'] = remainder_status
    df.loc[new_label, 'case'] = ''
    df.loc[new_label, 'plb'] = 'N'
    df.loc[new_label, 'nlb'] = nlb
    df.loc[new_label, 'line_no'] = new_line_no
    return df.sort_index().reset_index(drop=True)


def examine_speaker_extension(df,audit_df):
    """Split lines that carry a speaker plus trailing text into two rows.

    Pass 1: an unidentified line whose top candidate is 'ps8', whose
    ``parenthetical`` is ``'PartMidEnd'``, whose following line has
    ``parenthetical == 'Absent'`` and which contains none of the extension
    markers (O.S., V.O., CONT'D, CONTD, VOICE) is cut at its first ``(``:
    the text before becomes a speaker line ('ps7'), the rest a new
    parenthetical row ('ps10').

    Pass 2: the stripped text of every 'ps7' row is collected as a known
    speaker name.  An unidentified line (not already ps7/ps8/ps9) that
    contains such a name preceded only by whitespace is split after the name
    when the first non-blank character after it is

    * ``(`` (and the line's ``parenthetical`` is ``'PartMidEnd'``): the rest
      becomes a 'ps10' row;
    * ``:`` (and the name is not in the skip list): the rest keeps the
      line's original candidate list;
    * ``"`` or nothing at all: the rest, wrapped in parentheses, becomes a
      'ps10' row.

    Both loops iterate over the row labels as they were when the loop
    started, while rows are inserted along the way (kept from the original
    implementation).

    Args:
        df: DataFrame with the columns ``isIdentified``, ``nlb``, ``plb``,
            ``parenthetical``, ``Identification_Status``, ``data``,
            ``line_no``, ``When_Identified`` and ``case``.  Rows are added to
            it before it is re-indexed.
        audit_df: DataFrame indexed by line number with the columns
            ``line_removed`` and ``line_broken_into_multiple_lines``.
            Modified in place.

    Returns:
        The updated DataFrame (a new object whenever a line was split).
    """
    extn_list = ['O.S.','V.O.',"CONT'D","CONTD",'VOICE']
    speaker_skip_list = ['MONTAGES','MUSICAL MONTAGES','MORNING','AT HOTEL','TV','ESSENTIALS','ESSENTIAL','LATER']

    def _has_extension(text):
        return any(extn in str(text) for extn in extn_list)

    def _debug(*parts):
        # Debug output must not abort the audit on an unencodable line.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    # ---- Pass 1: top candidate 'ps8' with a parenthetical on the same line
    for index in df.index[2:-2]:
        if df['isIdentified'][index] == 'Yes':
            continue
        nnbl_index = index + 1 if df['nlb'][index] == 'N' else index + 2
        nnbl_par = df['parenthetical'][nnbl_index]
        data = df['data'][index]
        cur_line_pos = df['Identification_Status'][index].split(";")
        cur_line_par = df['parenthetical'][index]
        if not (cur_line_pos[0] == 'ps8' and cur_line_par == 'PartMidEnd'
                and nnbl_par == 'Absent' and not _has_extension(data)):
            continue
        _debug(data)
        if not isinstance(data, str):
            continue
        pos_starts = data.find('(')
        if pos_starts < 0:
            continue
        before_par = data[:pos_starts]
        after_par = data[pos_starts:]
        print ("Separating Parenthetical")
        print("Identifying as speaker")
        print(index)
        _debug(before_par)
        _debug(after_par)
        print("identifying parenthetical")
        df = _split_speaker_row(
            df, audit_df, index, before_par, after_par,
            'ps10', 'Complete', 'Separated Speaker and Parenthetical')

    # ---- Pass 2: known speaker names followed by ( : " or nothing
    is_speaker = df['Identification_Status'] == 'ps7'
    speaker_list = [elem.strip() for elem in df.loc[is_speaker, 'data'].astype(str)]
    unique_speaker_list = list(dict.fromkeys(speaker_list))
    print(unique_speaker_list)
    # Names are matched literally: an unescaped name containing '(' or '.'
    # would be an invalid or over-matching regular expression.
    speaker_patterns = [
        (speaker, re.compile(re.escape(speaker), re.IGNORECASE))
        for speaker in unique_speaker_list
    ]

    for index in df.index[2:-2]:
        if df['isIdentified'][index] == 'Yes':
            continue
        data = df['data'][index]
        if not isinstance(data, str):
            continue
        cur_line_pos = df['Identification_Status'][index].split(";")
        extn_found = _has_extension(data)
        for speaker, pattern in speaker_patterns:
            if df['Identification_Status'][index] in ('ps7','ps8','ps9'):
                # Also true right after a split of this row: stop trying names.
                break
            # Presence is tested case-sensitively, the position is then
            # located case-insensitively (kept from the original logic).
            if speaker not in data:
                continue
            print(index)
            found = pattern.search(data)
            before_speaker = data[:found.start()]
            after_speaker = data[found.end():]
            char1_after_speaker = after_speaker.lstrip()[:1]
            print("speaker match found")
            _debug("data 4567:", data)
            _debug("speaker 4568:",speaker)
            _debug("before speaker:",before_speaker)
            _debug("after speaker:",after_speaker)
            _debug("char1_after_speaker 4579 :",char1_after_speaker)

            # The name must be preceded by whitespace only (an empty prefix
            # does not count) and the line must carry no extension marker.
            if not before_speaker.isspace() or extn_found:
                continue

            if char1_after_speaker == '(' and df['parenthetical'][index] == 'PartMidEnd':
                print("before speaker inside the if condition:",before_speaker)
                print ("Seperating Parenthetical")
                print("Identifying speaker")
                print(index)
                print("identifying parenthetical")
                df = _split_speaker_row(
                    df, audit_df, index, before_speaker + speaker,
                    after_speaker, 'ps10', 'Complete',
                    'Separated Speaker and Parenthetical')
            elif char1_after_speaker == ':' and speaker not in speaker_skip_list:
                print("before speaker in elif condition 4624:", before_speaker)
                print ("Seperating : colon dialogue")
                print("Identifying speaker")
                print(index)
                print("possible dialogue")
                _debug(after_speaker)
                # The remainder keeps the line's candidates; its
                # 'parenthetical' is left as the empty string.
                df = _split_speaker_row(
                    df, audit_df, index, before_speaker + speaker,
                    after_speaker, ";".join(cur_line_pos), None,
                    'Separated Speaker and Dialogue seperated by colon:')
            elif char1_after_speaker == '' or char1_after_speaker == '"':
                # NOTE(review): the '' comparison fires when nothing follows
                # the name; the messages say "apostrophe", so a non-ASCII
                # quote character may have been intended here - confirm.
                print("before speaker in seperating apostrophe:", before_speaker)
                print ("Seperating apostrophe")
                print("Identifying speaker")
                print(index)
                print("identifying as parenthetical")
                # NOTE(review): audit note is the same text as the colon
                # case; kept unchanged - confirm whether that is intended.
                df = _split_speaker_row(
                    df, audit_df, index, before_speaker + speaker,
                    '(' + after_speaker.strip() + ')', 'ps10', 'Complete',
                    'Separated Speaker and Dialogue seperated by colon:')
    return df
def examine_action_using_top2_part1(df):
    """Settle lines whose top candidate is 'ps4' (action beginning).

    Each unidentified row (first two and last two rows excluded) is compared
    with the preceding and following line; a neighbour is taken two rows away
    instead of one when ``plb`` / ``nlb`` is not ``'N'``.  The row is skipped
    when the following line is exactly 'ps13' or 'ps15', or when the
    preceding line's top candidate is 'ps8'.

    A row with top candidate 'ps4' whose preceding line's top candidate is
    not 'ps4' is set to 'ps4' in either of two cases:

    1. the following line's top candidate is not 'ps7', the preceding line's
       top candidate is 'ps6', ``nlb`` is ``'N'`` and 'ps7' is not among the
       row's candidates;
    2. otherwise, when neither of the preceding line's first two candidates
       is 'ps5', the row and the following line have equal ``ssc`` and
       'ps5' or 'ps6' is one of the following line's first two candidates.

    ``isIdentified`` is not changed by this function.  A former second pass
    for 'ps6' was disabled in the original because it gave wrong results and
    is not carried over.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``ssc``, ``plb``,
            ``nlb``, ``Identification_Status``, ``data`` and
            ``When_Identified``.  Modified in place.

    Returns:
        The same DataFrame object.
    """

    def _mark_ps4(row):
        # .at writes into df itself; chained df[col][row] = ... may write to
        # a temporary copy and is ignored under pandas Copy-on-Write.
        df.at[row, 'Identification_Status'] = 'ps4'
        df.at[row, 'When_Identified'] = 'UsingTop2PNNBL'

    def _debug(*parts):
        # Debug output must not abort the audit on an unencodable line.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    for index in df.index[2:-2]:
        if df['isIdentified'][index] == 'Yes':
            continue
        cur_indent = df['ssc'][index]

        pnbl_index = index - 1 if df['plb'][index] == 'N' else index - 2
        pnbl_pos = df['Identification_Status'][pnbl_index].split(";")

        next_is_adjacent = df['nlb'][index] == 'N'
        nnbl_index = index + 1 if next_is_adjacent else index + 2
        nnbl_pos = df['Identification_Status'][nnbl_index].split(";")
        nnbl_indent = df['ssc'][nnbl_index]

        data = df['data'][index]
        cur_line_pos = df['Identification_Status'][index].split(";")

        ## skip if next is dialogue
        if "".join(nnbl_pos) in ('ps13', 'ps15'):
            continue

        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''
        nnbl_top2 = nnbl_pos[1] if len(nnbl_pos) > 1 else ''

        ## ps4 identification made stricter if pnbl top is ps8
        if pnbl_pos[0] == 'ps8':
            continue

        # Nothing below applies unless 'ps4' is the top candidate and the
        # preceding line's top candidate is something else.
        if cur_line_pos[0] != 'ps4' or pnbl_pos[0] == 'ps4':
            continue

        ## case 1: preceding line is ps6
        line_identified = False
        if nnbl_pos[0] != 'ps7':
            _debug(data)
            print(pnbl_pos[0],cur_line_pos[0],nnbl_pos[0])
            if 'ps6' in pnbl_pos[0] and next_is_adjacent and 'ps7' not in cur_line_pos:
                print('identifying as ps4 case 1 top 1')
                _mark_ps4(index)
                line_identified = True
            else:
                print("ps6 not in previous")
        if line_identified:
            continue

        ## case 2: following line looks like ps5/ps6 at the same indent
        print("checking for ps5/6 in next")
        if pnbl_pos[0] == 'ps5':
            print("skipping as previous top is ps5" )
            continue
        if pnbl_top2 and 'ps5' in pnbl_top2:
            print("skipping as previous top2 is ps5" )
            continue
        if cur_indent == nnbl_indent:
            if 'ps5' in nnbl_pos[0] or 'ps6' in nnbl_pos[0]:
                _debug(data,'identifying as ps4 case 2 top1')
                _mark_ps4(index)
            elif nnbl_top2 and 'ps5' in nnbl_top2:
                _debug(data,'identifying as ps4 case 2 top2 ps5')
                _mark_ps4(index)
            elif nnbl_top2 and 'ps6' in nnbl_top2:
                _debug(data,'identifying as ps4 case 2 top2 ps6')
                _mark_ps4(index)
        else:
            print("current indent is not equal to next indent")
        print("\n")
    return df
def refine_action_possibilties(df):
    """Drop candidates that conflict with an already settled neighbour.

    For every unidentified row except the first and last one, the candidate
    list in ``Identification_Status`` (``;``-separated) is pruned:

    * following line is exactly 'ps7'  -> remove 'ps5' and 'ps14';
    * following line is exactly 'ps4'  -> remove 'ps5';
    * preceding line is exactly 'ps4'  -> remove 'ps3' and 'ps7'.

    A neighbour is taken two rows away instead of one when ``plb`` / ``nlb``
    is not ``'N'``; where no such row exists it is treated as ``'blank'``.
    The pruned list is written back even if it became empty.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``line_no`` and ``data``.  Modified in
            place.

    Returns:
        The same DataFrame object.
    """

    def _debug(*parts):
        # Debug output must not abort the audit on an unencodable line.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    for index in df.index[1:-1]:
        if df['isIdentified'][index] == 'Yes':
            continue

        if index == 0:
            pnbl_pos = ['blank']
        elif df['plb'][index] == 'N':
            pnbl_pos = df['Identification_Status'][index-1].split(";")
        elif index - 1 == 0:
            # The original assigned this to a misspelled name ('pnpl_pos').
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df['Identification_Status'][index-2].split(";")

        if index == df.index[-1]:
            nnbl_pos = ['blank']
        elif df['nlb'][index] == 'N':
            nnbl_pos = df['Identification_Status'][index+1].split(";")
        elif index + 1 == df.index[-1]:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df['Identification_Status'][index+2].split(";")

        line_no = df['line_no'][index]
        data = df['data'][index]
        line_new_pos = df['Identification_Status'][index].split(";")

        pnbl_status = "".join(pnbl_pos)
        nnbl_status = "".join(nnbl_pos)

        if nnbl_status == 'ps7':
            _debug(line_no,data)
            print("remove ps5,14")
            line_new_pos = [ps for ps in line_new_pos if ps not in ('ps5', 'ps14')]
        if nnbl_status == 'ps4':
            _debug(line_no,data)
            print("remove ps5")
            line_new_pos = [ps for ps in line_new_pos if ps != 'ps5']
        if pnbl_status == 'ps4':
            _debug(line_no,data)
            print("remove ps3 and 7")
            line_new_pos = [ps for ps in line_new_pos if ps not in ('ps3', 'ps7')]

        # .at writes into df itself; chained df[col][index] = ... may write
        # to a temporary copy and is ignored under pandas Copy-on-Write.
        df.at[index, 'Identification_Status'] = ";".join(line_new_pos)
    return df
def prep_pnnbl_eligible_csv(pnbl_eligibility_matrix,nnbl_eligibility_matrix):
    """Normalise the two eligibility matrices and store them under ``mypath``.

    Each matrix CSV is read with its first row skipped, its first column is
    renamed to 'Description' and its second to 'Possibilities', and the
    result is written (without the index) to ``pnbl_eligible_pos.csv`` /
    ``nnbl_eligible_pos.csv`` in the ``mypath`` directory.  Both written
    files are then read back with 'Possibilities' as index column; the
    loaded frames are discarded.

    Args:
        pnbl_eligibility_matrix: CSV source of the pnbl matrix.
        nnbl_eligibility_matrix: CSV source of the nnbl matrix.

    Returns:
        None.
    """
    out_dir = mypath
    sources = (pnbl_eligibility_matrix, nnbl_eligibility_matrix)
    targets = (
        os.path.join(out_dir, 'pnbl_eligible_pos.csv'),
        os.path.join(out_dir, 'nnbl_eligible_pos.csv'),
    )

    # Same phase order as before: load both, rename both, write both,
    # then read both back.
    matrices = [pd.read_csv(source, skiprows = [0]) for source in sources]

    for matrix in matrices:
        headers = matrix.columns
        matrix.rename(
            columns={headers[1]: 'Possibilities', headers[0]: 'Description'},
            inplace = True,
        )

    for matrix, target in zip(matrices, targets):
        matrix.to_csv(target, index =False)

    for target in targets:
        pd.read_csv(target, index_col = ['Possibilities'])
def check_eligibility_using_identified_pnnbl(df):
    """Prune each line's candidates using neighbours that are already settled.

    The eligibility tables ``pnbl_eligible_pos.csv`` and
    ``nnbl_eligible_pos.csv`` are loaded from the ``mypath`` directory.  For
    every unidentified row, when the preceding (respectively following) line
    has exactly one candidate, only those candidates of the row are kept that
    appear in the table's 'Possibilities' column with ``'yes'`` in the column
    named after the neighbour's code.  A neighbour is taken two rows away
    instead of one when ``plb`` / ``nlb`` is not ``'N'``; where no such row
    exists it counts as ``'blank'`` and is ignored.

    A row left without candidates becomes 'ps17' (special term).  A row left
    with exactly one candidate gets ``isIdentified = 'Yes'``.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``line_no``,
            ``data``, ``Identification_Status``, ``plb`` and ``nlb``.
            Modified in place.

    Returns:
        Tuple ``(df, pos_decreased, lines_identified)``: the same DataFrame,
        whether the total number of candidates went down during this pass,
        and how many rows ended the pass with a single candidate.
    """
    total_pos_before = 0
    total_pos_after = 0
    lines_identified = 0
    cur_dir = mypath
    pnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'pnbl_eligible_pos.csv'))
    nnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'nnbl_eligible_pos.csv'))

    def _debug(*parts):
        # Debug output must not abort the audit on an unencodable line.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    def _eligible(table, neighbour_code):
        # Codes marked 'yes' in the column named after the neighbour's code.
        return table.loc[table[neighbour_code] == 'yes', 'Possibilities'].to_list()

    for index in df.index:
        if df['isIdentified'][index] == 'Yes':
            total_pos_before += 1
            total_pos_after += 1
            print(total_pos_before,total_pos_after)
            continue

        line_no = df['line_no'][index]
        data = df['data'][index]
        cur_line_pos = df['Identification_Status'][index].split(";")
        # An empty status splits to [''] and is not counted as a candidate.
        if cur_line_pos[0] != '':
            total_pos_before += len(cur_line_pos)

        if index == 0:
            pnbl_pos = ['blank']
        elif df['plb'][index] == 'N':
            pnbl_pos = df['Identification_Status'][index-1].split(";")
        elif index - 1 == 0:
            # The original assigned this to a misspelled name ('pnpl_pos').
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df['Identification_Status'][index-2].split(";")

        if index == df.index[-1]:
            nnbl_pos = ['blank']
        elif df['nlb'][index] == 'N':
            nnbl_pos = df['Identification_Status'][index+1].split(";")
        elif index + 1 == df.index[-1]:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df['Identification_Status'][index+2].split(";")

        line_new_pos = cur_line_pos
        _debug(line_no,data)
        print("current line pos", cur_line_pos,df['Identification_Status'][index])
        _debug("previous line pos",pnbl_pos)
        _debug("next line pos",nnbl_pos)

        if len(pnbl_pos) == 1 and pnbl_pos[0] != 'blank':
            print("pnbl is identified as ", pnbl_pos)
            ## keep only possibilities which can exist with this pnbl
            pnbl_eligible_pos = _eligible(pnbl_eligible_df, pnbl_pos[0])
            print("eligible possibilties as per pnbl",pnbl_eligible_pos)
            line_new_pos = [ps for ps in line_new_pos if ps in pnbl_eligible_pos]
            print("line new possibilities", line_new_pos)
        else:
            print("previous line not identified")

        if len(nnbl_pos) == 1 and nnbl_pos[0] != 'blank':
            print("nnbl is identified as ", nnbl_pos)
            ## keep only possibilities which can exist with this nnbl
            nnbl_eligible_pos = _eligible(nnbl_eligible_df, nnbl_pos[0])
            print("eligible possibilties as per nnbl",nnbl_eligible_pos)
            line_new_pos = [ps for ps in line_new_pos if ps in nnbl_eligible_pos]
            print("line new possibilities", line_new_pos)
        else:
            print("next line not identified")

        ## make null as special term
        if not line_new_pos:
            print("making null possibility special term ps17")
            line_new_pos = ['ps17']
        if len(line_new_pos) == 1:
            # .at writes into df itself; chained df[col][index] = ... may
            # write to a temporary copy.
            df.at[index, 'isIdentified'] = 'Yes'
            lines_identified += 1
        df.at[index, 'Identification_Status'] = ";".join(line_new_pos)
        total_pos_after += len(line_new_pos)
        print(total_pos_before,total_pos_after)

    print(total_pos_before,total_pos_after)
    pos_decreased = total_pos_after < total_pos_before
    return df,pos_decreased,lines_identified
def do_while_examine_using_identified_pnnbl(df):
    """Repeat the neighbour-eligibility pass until it stops removing candidates.

    ``check_eligibility_using_identified_pnnbl`` is run at least once and
    again for as long as it reports that the number of candidates decreased.
    The pass number and the running sum of identified lines are printed after
    every pass and once more at the end.

    Args:
        df: DataFrame handed to ``check_eligibility_using_identified_pnnbl``.

    Returns:
        The DataFrame returned by the last pass.
    """
    passes = 0
    identified_so_far = 0
    while True:
        passes += 1
        df, shrank, newly_identified = check_eligibility_using_identified_pnnbl(df)
        identified_so_far += newly_identified
        print(passes,identified_so_far)
        if not shrank:
            break
    print(passes,identified_so_far)
    return df
def start_top_identifications_part1(df):
    """Collapse candidate lists whose leading entry is decisive.

    ``Identification_Status`` holds ``;``-separated candidate codes; element 0
    is treated as the "top" candidate.  Every row except the first and the
    last that is not flagged ``isIdentified == 'Yes'`` is examined:

    * If the top candidate is ``ps1``, ``ps7``, ``ps8`` or ``ps9``, or a
      ``ps6``/``ps16`` that the neighbouring lines do not contradict, the
      status is collapsed to that candidate (``ps2``, ``ps3`` or ``ps30`` are
      kept alongside it in the cases coded below).
    * Otherwise, a row with ``ps5`` among its first two candidates that follows
      a ``ps4`` line at the same indent and has ``nlb == 'Y'`` becomes ``ps6``.
    * A second pass collapses rows led by ``ps8`` whose text contains a
      speaker-extension marker to plain ``ps8``.

    Fixes relative to the previous revision:

    * a misspelt variable left the previous-line candidates empty for the
      second row, so later ``pnbl_pos[0]`` look-ups raised ``IndexError``;
    * ``pnbl_pos == 'ps14'`` compared a list with a string and never matched;
    * ``cur_line_pos[1]`` raised ``IndexError`` for single-candidate rows;
    * writes used chained indexing (``df[col][i] = v``), which pandas does not
      guarantee to reach the frame; they now go through ``df.at``;
    * the fifth candidate is now considered whenever it exists (previously
      only when there were exactly five).

    Args:
        df: frame with the columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``ssc``, ``parenthetical``, ``case``, ``data``,
            ``line_no`` and ``When_Identified``.
            NOTE(review): the ``index - 1`` / ``index + 2`` arithmetic assumes
            a consecutive integer index - confirm against callers.

    Returns:
        The same frame, modified in place.
    """
    status_col = 'Identification_Status'
    stage = 'StartIdentifyingTopsPart1'
    final_index = df.index[-1]

    def _show(*values):
        # The original wrapped these prints in try/except; presumably the
        # console cannot always encode the script text.
        try:
            print(*values)
        except UnicodeError:
            pass

    # loop through to examine speaker extension
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue

        # Previous non-blank line (pnbl): one row up when plb is 'N',
        # otherwise two rows up.
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[pnbl_index, status_col].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next non-blank line (nnbl), mirrored.
        nnbl_index = index + 1
        if index == final_index:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[nnbl_index, status_col].split(";")
        elif index + 1 == final_index:
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")

        cur_indent = df.at[index, 'ssc']
        pnbl_indent = df.at[pnbl_index, 'ssc']
        nnbl_indent = df.at[nnbl_index, 'ssc']

        # Top candidate of the line before pnbl; None when there is no such row.
        try:
            hop = 1 if df.at[pnbl_index, 'plb'] == 'N' else 2
            ppnbl_top = df.at[pnbl_index - hop, status_col].split(";")[0]
        except (KeyError, IndexError, AttributeError):
            ppnbl_top = None
        ppnbl_top_not_16 = ppnbl_top != 'ps16'

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        cur_par = df.at[index, 'parenthetical']
        try:
            pnbl_par = df.at[pnbl_index, 'parenthetical']
            pnbl_case = df.at[pnbl_index, 'case']
            pnbl_data = df.at[pnbl_index, 'data']
        except KeyError:
            pnbl_par, pnbl_case, pnbl_data = None, None, ''
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        cur_line_pos = df.at[index, status_col].split(";")
        tops = cur_line_pos[:5]
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        top3 = cur_line_pos[2] if len(cur_line_pos) >= 3 else top1
        nnbl_top = nnbl_pos[0]

        ## if top is 1,6,7,16 identify them
        ## identify as 7 where 9 is top and 7 is 2nd and parenthetical absent
        decisive = (
            top1 in ('ps1', 'ps7', 'ps8', 'ps9')
            or (top1 == 'ps6'
                and pnbl_par == 'Absent'
                and "".join(nnbl_pos) != 'ps6'
                and nnbl_top not in ('ps5', 'ps6'))
            or (top1 == 'ps16' and nnbl_top not in ('ps13', 'ps15', 'ps10'))
        )
        if decisive:
            _show(line_no, data)
            if top1 == 'ps6':
                # NOTE(review): the source had lost its indentation; only the
                # first check is assumed to sit under the len() guard.
                if len(cur_line_pos) > 1 and 'ps15' in tops:
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                if pnbl_pos[0] in ('ps13', 'ps14'):
                    print("not identifying as ps6 as could be ps15")
                    continue
                if pnbl_case == 'AllUpper' and len(pnbl_data.split()) == 1:
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                if len(data.split()) == 1 and cur_indent > pnbl_indent:
                    print("not identifying as ps6 can also be ps7 ")
                    continue
            print("identifying as top", top1)
            line_new_pos = []
            if top1 == 'ps9' and top2 == 'ps7' and cur_par == 'Absent':
                line_new_pos.append(top2)
            else:
                line_new_pos.append(top1)
            if nnbl_top == 'ps1' and 'ps2' in cur_line_pos:
                line_new_pos.append('ps2')
                print("added ps2 to ps1")
            if pnbl_pos[0] == 'ps1' and 'ps3' in cur_line_pos:
                line_new_pos.append('ps3')
                print("added ps3 to ps1")
            if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
                ## not identifying as ps1
                continue
            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")
            df.at[index, status_col] = ";".join(line_new_pos)
            df.at[index, 'When_Identified'] = stage
            continue

        if 'ps5' in cur_line_pos[:2] and 'ps16' not in pnbl_top2:
            same_indent_as_prev = cur_indent == pnbl_indent
            after_action = pnbl_pos[0] == 'ps4' or (pnbl_pos[0] == 'ps5' and ppnbl_top_not_16)
            if (after_action and df.at[index, 'nlb'] == 'N'
                    and same_indent_as_prev and cur_indent == nnbl_indent):
                # The ps5 identification is intentionally disabled in this pass.
                print("code commented")
            elif pnbl_pos[0] == 'ps4' and df.at[index, 'nlb'] == 'Y' and same_indent_as_prev:
                print("identifying current as ps6 as next also blank")
                _show(line_no, data)
                df.at[index, status_col] = 'ps6'
                df.at[index, 'When_Identified'] = stage

    ## additionally identify the ps8
    extension_markers = ('O.S.', 'V.O.', "CONT'D", "CONTD", 'VOICE', 'CONT.')
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        top = df.at[index, status_col].split(";")[0]
        text = str(df.at[index, 'data'])
        if top == 'ps8' and any(marker in text for marker in extension_markers):
            df.at[index, status_col] = 'ps8'
            df.at[index, 'When_Identified'] = stage
    return df
def start_top_identifications_part1_diluted(df):
    """More lenient variant of the first top-identification pass.

    ``Identification_Status`` holds ``;``-separated candidate codes; element 0
    is treated as the "top" candidate.  Every row except the first and the
    last that is not flagged ``isIdentified == 'Yes'`` is examined:

    * rows led by ``ps1`` whose second/third candidate is ``ps6`` (or whose
      second is ``ps8``) are left untouched;
    * rows led by ``ps1`` or ``ps7``, or by a ``ps6``/``ps16`` that the
      neighbouring lines do not contradict, are collapsed to the top candidate
      (keeping ``ps30`` next to ``ps1`` when present);
    * otherwise a row with ``ps5`` among its first two candidates is identified
      as ``ps5`` (same indent as both neighbours, none of them all-caps,
      following ``ps4``/``ps5``) or as ``ps6`` (following ``ps4`` at the same
      indent with ``nlb == 'Y'``).

    Fixes relative to the previous revision: a misspelt variable left the
    previous-line candidates empty for the second row (later ``IndexError``);
    ``cur_line_pos[1]`` raised ``IndexError`` on single-candidate rows; writes
    used chained indexing and now go through ``df.at``; the fifth candidate is
    considered whenever it exists; bare ``except`` clauses were narrowed.

    Args:
        df: frame with the columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``ssc``, ``parenthetical``, ``case``, ``data``,
            ``line_no`` and ``When_Identified``.
            NOTE(review): assumes a consecutive integer index - confirm.

    Returns:
        The same frame, modified in place.
    """
    status_col = 'Identification_Status'
    stage = 'StartIdentifyingTopsDiluted'
    final_index = df.index[-1]

    def _show(*values):
        # Presumably guards against consoles that cannot encode the text.
        try:
            print(*values)
        except UnicodeError:
            pass

    print("in tops diluted")
    # loop through to examine speaker extension
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        print(index)

        # Previous non-blank line: one row up when plb is 'N', else two.
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[pnbl_index, status_col].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next non-blank line, mirrored.
        nnbl_index = index + 1
        if index == final_index:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[nnbl_index, status_col].split(";")
        elif index + 1 == final_index:
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")

        cur_indent = df.at[index, 'ssc']
        try:
            pnbl_indent = df.at[pnbl_index, 'ssc']
            pnbl_case = df.at[pnbl_index, 'case']
        except KeyError:
            pnbl_indent, pnbl_case = -1, ''
        try:
            nnbl_indent = df.at[nnbl_index, 'ssc']
            nnbl_case = df.at[nnbl_index, 'case']
        except KeyError:
            nnbl_indent, nnbl_case = -1, ''

        # Top candidate of the line before pnbl; None when there is no such row.
        try:
            hop = 1 if df.at[pnbl_index, 'plb'] == 'N' else 2
            ppnbl_top = df.at[pnbl_index - hop, status_col].split(";")[0]
        except (KeyError, IndexError, AttributeError):
            ppnbl_top = None
        ppnbl_top_not_16 = ppnbl_top != 'ps16'

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        cur_line_case = df.at[index, 'case']
        pnbl_par = df.at[pnbl_index, 'parenthetical']
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        cur_line_pos = df.at[index, status_col].split(";")
        tops = cur_line_pos[:5]
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        top3 = cur_line_pos[2] if len(cur_line_pos) >= 3 else top1
        nnbl_top = nnbl_pos[0]

        if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
            ## not identifying as ps1
            continue

        ## if top is 1,6,7,16 identify them
        decisive = (
            top1 in ('ps1', 'ps7')
            or (top1 == 'ps6'
                and pnbl_par == 'Absent'
                and "".join(nnbl_pos) != 'ps6'
                and nnbl_top != 'ps5')
            or (top1 == 'ps16' and nnbl_top != 'ps15')
        )
        if decisive:
            _show(line_no, data)
            if len(cur_line_pos) > 1 and top1 == 'ps6' and 'ps15' in tops:
                print("not identifying as ps6 can also be ps15 ")
                continue
            print("identifying as top", top1)
            line_new_pos = [top1]
            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")
            df.at[index, status_col] = ";".join(line_new_pos)
            df.at[index, 'When_Identified'] = stage
            continue

        if 'ps5' in cur_line_pos[:2] and 'ps16' not in pnbl_top2:
            same_indent_as_prev = cur_indent == pnbl_indent
            after_action = pnbl_pos[0] == 'ps4' or (pnbl_pos[0] == 'ps5' and ppnbl_top_not_16)
            none_all_upper = 'AllUpper' not in (pnbl_case, cur_line_case, nnbl_case)
            if (after_action and df.at[index, 'nlb'] == 'N'
                    and same_indent_as_prev and cur_indent == nnbl_indent
                    and none_all_upper):
                print("Lenient: code not commented")
                print("identifying current as ps5")
                _show(line_no, data)
                df.at[index, status_col] = 'ps5'
                df.at[index, 'When_Identified'] = stage
            elif pnbl_pos[0] == 'ps4' and df.at[index, 'nlb'] == 'Y' and same_indent_as_prev:
                print("identifying current as ps6 as next also blank")
                _show(line_no, data)
                df.at[index, status_col] = 'ps6'
                df.at[index, 'When_Identified'] = stage
    return df
def examine_speaker_mix_part1(df,audit_df):
    """Split "SPEAKER (parenthetical)" lines into two rows.

    A row is split when it is not flagged ``isIdentified == 'Yes'``, has
    ``ps30`` among its candidates, is not led by ``ps1``/``ps2``/``ps4``/
    ``ps5``/``ps13``/``ps14``, has ``parenthetical == 'PartMidEnd'``, contains
    no speaker-extension marker (``O.S.``, ``V.O.`` ...), contains ``(`` and
    the text before the first ``(`` is upper case.  The text before ``(``
    stays on the row; the rest goes to a new row inserted right after it with
    status ``ps10`` and a ``line_no`` halfway to the next line.

    Fixes relative to the previous revision:

    * the loop iterated over the index captured before any insertion while
      rebinding ``df`` to the longer frame, so after every split one more row
      at the end of the script was never examined; rows are now visited by
      position against the current frame;
    * splitting the last row raised ``KeyError`` looking up the next line
      number; the new row now gets ``line_no + 0.5`` in that case;
    * writes used chained indexing and now go through ``df.at``/``df.loc``;
    * the regex ``'\\('`` (an invalid escape in a non-raw string) is replaced
      by ``str.find``.

    Args:
        df: script frame with the columns ``isIdentified``,
            ``Identification_Status``, ``data``, ``parenthetical``,
            ``When_Identified``, ``case``, ``plb``, ``nlb`` and ``line_no``.
        audit_df: accepted for interface compatibility; neither read nor
            modified.
            NOTE(review): the previous revision immediately rebound this name
            to a sorted copy of ``df``, so none of its audit writes ever
            reached the caller and the "is this line number taken" test ran
            against the row labels of that copy.  That observable behaviour is
            kept (see ``reserved`` below); confirm whether the caller's audit
            frame was meant to be updated here.

    Returns:
        A new frame, sorted and re-indexed from 0, including the inserted rows.
    """
    df = df.sort_index().reset_index(drop=True)
    # Labels the previous revision treated as "already used" line numbers.
    reserved = set(df.index)

    skipped_tops = ('ps1', 'ps2', 'ps14', 'ps5', 'ps13', 'ps4')
    extension_markers = ('O.S.', 'V.O.', "CONT'D", "CONTD", 'VOICE')

    index = -1
    while index + 1 < len(df):
        index += 1
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue
        data = df.at[index, 'data']
        cur_line_pos = status.split(";")
        ## if parenthetical at last then split to new line
        if 'ps30' not in cur_line_pos:
            continue
        if cur_line_pos[0] in skipped_tops:
            # skipping as could be slugline
            continue
        if any(marker in str(data) for marker in extension_markers):
            continue
        if df.at[index, 'parenthetical'] != 'PartMidEnd':
            continue

        try:
            print(data)
        except UnicodeError:
            pass
        paren_at = data.find('(')
        if paren_at == -1:
            continue
        before_par = data[:paren_at]
        after_par = data[paren_at:]
        print("before_par = data[:pos_starts] line 5557:", before_par)
        print("after_par = data[pos_starts:] line 5558 :", after_par)
        print ("Seperating Parenthetical")
        print("Identifying as speaker mix with dialogue and current pos")
        print(cur_line_pos)
        print(index)
        if not before_par.isupper():
            # skip as possibly not speaker
            continue

        nlb = df.at[index, 'nlb']
        df.at[index, 'data'] = before_par
        df.at[index, 'parenthetical'] = 'Absent'
        df.at[index, 'When_Identified'] = 'ExaminingSpeakerMix'
        df.at[index, 'nlb'] = 'N'

        # The new row sits halfway between this line and the next one.
        line_no = float(df.at[index, 'line_no'])
        if index + 1 < len(df):
            next_line_no = float(df.at[index + 1, 'line_no'])
        else:
            next_line_no = line_no + 1
        new_line_no = (line_no + next_line_no) / 2
        if new_line_no in reserved:
            new_line_no = (new_line_no + next_line_no) / 2
        reserved.add(new_line_no)

        print(
            "index:",index,"\n",
            "df['data'][index]:",df.at[index, 'data'],"\n",
            "df['parenthetical'][index]:",df.at[index, 'parenthetical'],"\n",
            "df['When_Identified'][index]:",df.at[index, 'When_Identified'],"\n",
            "df['Identification_Status'][index]:",df.at[index, 'Identification_Status'],"\n",
            "df['nlb'][index]:",df.at[index, 'nlb'],"\n",
        )
        print(df.at[index, 'Identification_Status'])
        try:
            print(after_par)
        except UnicodeError:
            pass
        print("identifying parenthetical")

        # A fractional label sorts between this row and the next one.
        new_label = index + 0.25
        df.loc[new_label] = np.nan
        df.loc[new_label, 'data'] = after_par
        df.loc[new_label, 'parenthetical'] = 'Complete'
        df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerMix'
        df.loc[new_label, 'Identification_Status'] = 'ps10'
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = nlb
        df.loc[new_label, 'line_no'] = new_line_no
        df = df.sort_index().reset_index(drop=True)
    return df
# df.to_csv(p.output_file_path,index=False)
# lines_not_removed = audit_df.loc[audit_df['line_removed'] != 'Yes'].index.to_list()
# audit_df.sort_index(inplace= True)
# audit_df.reset_index(inplace= True)
# for line in lines_not_removed:
# new_data = ''
# try:
# new_data =df.loc[df['line_no'] == line, 'data'].values[0]
# except:
# pass
# #print(new_data)
# audit_df.loc[audit_df['line_no'] == line, 'data_corrected'] = new_data
# #print(audit_df.loc[audit_df['line_no'] == line, 'data_corrected'])
# audit_df.to_csv(p.audit_report_path, index = False)
def examine_speaker_mix_part2(df,audit_df):
    """Split ``ps30`` lines that mix a speaker name with dialogue.

    Only rows whose first candidate contains ``ps30`` (and whose second
    candidate, if any, does too) are considered.  Two patterns are handled:

    * ``SPEAKER: dialogue`` - when the text has a ``:`` and no speaker
      extension marker, the part before the first colon stays on the row
      (status ``ps7``) and the part after it becomes a new ``ps15`` row;
    * ``SPEAKER dialogue`` - otherwise the leading run of upper-case words
      stays on the row (status ``ps7``, case ``AllUpper``) and the remaining
      words become a new ``ps15;ps13`` row.

    The new row is inserted directly below with a ``line_no`` halfway to the
    next line, and ``audit_df`` gets a row for the new line number plus a note
    on the original line.

    Fixes relative to the previous revision:

    * the loop iterated over the index captured before any insertion while
      rebinding ``df`` to the longer frame, so after every split one more row
      at the end was never examined; rows are now visited by position;
    * dialogue words were concatenated without separators
      (``"hello there"`` became ``"hellothere"``);
    * a whitespace-only prefix before the colon was accepted as a speaker
      because ``before_colon.strip().isspace()`` can never be true;
    * the colon branch wrote to ``audit_df`` through chained indexing, which
      pandas does not guarantee to reach the frame;
    * the fallback for the line-number arithmetic read ``new_line_no`` before
      assigning it; the arithmetic is now done on floats in both branches;
    * splitting the last row raised ``KeyError``; it now uses ``line_no + 0.5``.

    Args:
        df: script frame with the columns ``Identification_Status``, ``data``,
            ``parenthetical``, ``When_Identified``, ``case``, ``plb``, ``nlb``
            and ``line_no``.  NOTE(review): assumed to carry a sorted numeric
            index, as the fractional-label insertion relies on it - confirm.
        audit_df: audit frame whose row labels are compared with line numbers;
            it is updated in place (columns ``line_removed`` and
            ``line_broken_into_multiple_lines``).

    Returns:
        ``df`` itself when nothing was split, otherwise a new sorted frame
        re-indexed from 0 that includes the inserted rows.
    """
    ## examine the ps30s and split with colon and all caps speaker
    print("Start speaker mix part2")
    extn_list = ['O.S.','V.O.',"CONT'D","CONTD",'VOICE']
    stage = 'ExaminingSpeakerMixDialogue'

    def _show(*values):
        # Presumably guards against consoles that cannot encode the text.
        try:
            print(*values)
        except UnicodeError:
            pass

    pos = 0
    while pos < len(df):
        index = df.index[pos]
        pos += 1  # from here on ``pos`` is the position of the following row
        data = df.at[index, 'data']
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        if 'ps30' not in cur_line_pos[0]:
            continue
        if len(cur_line_pos) > 1 and 'ps30' not in cur_line_pos[1]:
            continue
        # NOTE(review): the previous revision also skipped rows led by
        # ps1/ps2 here, but that test could never be true after the ps30
        # check above; confirm the intended filter.

        ## search colon and separate after colon
        _show("data:\n", data)
        print(extn_list)
        extn_found = any(extn in str(data) for extn in extn_list)
        colon_at = data.find(':')

        if colon_at != -1 and not extn_found:
            before_colon = data[:colon_at]
            after_colon = data[colon_at + 1:]
            if not before_colon.strip():
                print ("nothing before colon")
                continue
            print ("Seperating speaker dialogue separated by colon")
            print(index)
            _show(before_colon)
            head_text = before_colon
            tail_text = after_colon
            tail_status = 'ps15'
            audit_note = 'Separated Speaker and Dialogue mixed with colon:'
            by_colon = True
        elif cur_line_pos[0] == 'ps30':
            words = data.lstrip().split(" ")
            k = 0
            for word in words:
                _show(word)
                if word.isupper():
                    k += 1
                else:
                    break
            print(k)
            speaker = " ".join(words[:k]).strip()
            dialogue = " ".join(words[k:]).strip()
            print ("Seperating speaker dialogue for ps30")
            print(index)
            _show(speaker)
            if not speaker or not dialogue:
                print("unable to separate speaker from line, speaker possibly blank or line is not speaker dialogue mix",index)
                continue
            head_text = speaker
            tail_text = dialogue
            tail_status = 'ps15;ps13'
            audit_note = 'Separated Speaker and Dialogue '
            by_colon = False
        else:
            continue

        nlb = df.at[index, 'nlb']
        df.at[index, 'data'] = head_text
        df.at[index, 'parenthetical'] = 'Absent'
        df.at[index, 'When_Identified'] = stage
        if not by_colon:
            df.at[index, 'case'] = 'AllUpper'
        df.at[index, 'Identification_Status'] = 'ps7'
        df.at[index, 'nlb'] = 'N'

        # The new row sits halfway between this line and the next one.
        line_no = df.at[index, 'line_no']
        if pos < len(df):
            next_line_no = float(df['line_no'].iloc[pos])
        else:
            next_line_no = float(line_no) + 1
        new_line_no = (float(line_no) + next_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + next_line_no) / 2
        print("new_line_no:", new_line_no)

        if by_colon:
            audit_df.loc[new_line_no] = ''
        try:
            audit_df.loc[new_line_no, 'line_removed'] = 'No'
        except Exception:
            # Best-effort fallback kept from the previous revision.
            audit_df.loc[new_line_no] = ''
            audit_df.loc[new_line_no, 'line_removed'] = 'No'
        try:
            audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = audit_note
        except Exception:
            audit_df.loc[line_no] = ''
            audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = audit_note

        if by_colon:
            _show(after_colon)
            print("identifying after colon as dialogue end")
        else:
            print("identifying dialogue from ps30 as ps13;ps15")
            _show(tail_text)

        # A fractional label sorts between this row and the next one.
        new_label = index + 0.25
        df.loc[new_label] = ''
        df.loc[new_label, 'data'] = tail_text
        df.loc[new_label, 'parenthetical'] = 'Absent'
        df.loc[new_label, 'When_Identified'] = stage
        df.loc[new_label, 'Identification_Status'] = tail_status
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = nlb
        df.loc[new_label, 'line_no'] = new_line_no
        df = df.sort_index().reset_index(drop=True)
    return df
# df.to_csv(p.output_file_path, index = False)
# lines_not_removed = audit_df.loc[audit_df['line_removed'] != 'Yes'].index.to_list()
# audit_df.sort_index(inplace= True)
# audit_df.reset_index(inplace= True)
# for line in lines_not_removed:
# new_data = ''
# try:
# new_data =df.loc[df['line_no'] == line, 'data'].values[0]
# except:
# pass
# #print(new_data)
# audit_df.loc[audit_df['line_no'] == line, 'data_corrected'] = new_data
# #print(audit_df.loc[audit_df['line_no'] == line, 'data_corrected'])
# audit_df.to_csv(p.audit_report_path, index = False)
def start_top_identifications_part2(df):
    """Narrow ``ps5``/``ps6``/``ps15`` candidates using surrounding lines.

    ``Identification_Status`` holds ``;``-separated candidate codes.  For each
    row that is not flagged ``isIdentified == 'Yes'`` and has a status:

    1. If both ``ps6`` and ``ps15`` are candidates, the nearest non-blank rows
       above and below that have a different ``ssc`` are consulted:
       ``ps15``/``ps16`` above with ``ps7``/``ps10`` below drops ``ps15``;
       ``ps7``/``ps10`` above with ``ps1``/``ps4``/``ps6``/``ps16`` below drops
       ``ps6``.
    2. If the next line is plain ``ps6`` and ``nlb == 'N'``, ``ps6`` is dropped
       from the candidates.
    3. If the next non-blank line starts a ``ps4`` (and ``ps1`` is not one of
       the first two candidates here), ``ps1`` or ``ps7``, and the previous
       non-blank line has ``ps5`` as first or second candidate at the same
       indent, the row is identified as ``ps6``.
    4. If the first two candidates are ``ps5``/``ps6``, ``nlb == 'Y'``, the
       next non-blank line is plain ``ps6`` and ``plb == 'N'``: same indent as
       the previous line identifies ``ps6``, a different indent drops ``ps5``.

    Fixes relative to the previous revision:

    * writes used chained indexing and now go through ``df.at``;
    * the indent scans of step 1 ran for every row although only step 1 reads
      their result (quadratic work); they now run on demand;
    * a misspelt variable left the previous-line candidates empty for the
      second row;
    * diagnostic comparisons such as ``pnbl_pos == 'ps6'`` compared a list
      with a string and never matched;
    * bare ``except`` clauses used for control flow were replaced by explicit
      checks.

    Args:
        df: frame with the columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``ssc``, ``lcp``, ``data``, ``line_no`` and
            ``When_Identified``.
            NOTE(review): assumes a consecutive integer index starting at 0.

    Returns:
        The same frame, modified in place.
    """
    status_col = 'Identification_Status'
    final_index = df.index[-1]

    def _show(*values):
        # Presumably guards against consoles that cannot encode the text.
        try:
            print(*values)
        except UnicodeError:
            pass

    def _nearest_other_indent(start, step, stop, indent):
        """Return the candidates of the first non-blank row with another indent.

        Walks from ``start`` in direction ``step`` and gives up (returning
        ``None``) on reaching ``stop``, which itself is never inspected.
        """
        cursor = start
        while cursor != stop:
            row_status = df.at[cursor, status_col]
            if row_status != 'blank' and df.at[cursor, 'ssc'] != indent:
                return row_status.split(";")
            cursor += step
        return None

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        # Previous non-blank line: one row up when plb is 'N', else two.
        has_prev = True
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
            has_prev = False
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[pnbl_index, status_col].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
            has_prev = False
        else:
            pnbl_index = index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next non-blank line, mirrored.
        if index == final_index:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, status_col].split(";")
        elif index + 1 == final_index:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, status_col].split(";")

        cur_indent = df.at[index, 'ssc']
        pnbl_indent = df.at[pnbl_index, 'ssc'] if has_prev else -1
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        cur_line_pos = status.split(";")
        # First five candidates, padded with the first one.
        padded = cur_line_pos[:5] + [cur_line_pos[0]] * (5 - len(cur_line_pos[:5]))
        top1, top2, top3, top4, top5 = padded

        ## if cur line contains both 15 and 6
        if 'ps6' in cur_line_pos and 'ps15' in cur_line_pos:
            found_above = _nearest_other_indent(index, -1, 0, cur_indent)
            found_below = _nearest_other_indent(index, 1, final_index, cur_indent)
            pdil_pos = found_above or []
            ndil_pos = found_below or []
            prev_flag = 'start' if found_above is None else False
            next_flag = 'end' if found_below is None else False
            print("CURRENT CONATINS 15 6")
            _show(data)
            print("check pdil , ndil possibilties")
            print(pdil_pos)
            print(cur_line_pos)
            print(ndil_pos)
            print(prev_flag)
            print(next_flag)
            if found_above is not None and found_below is not None:
                if pdil_pos[0] in ('ps15', 'ps16'):
                    if ndil_pos[0] in ('ps7', 'ps10'):
                        print("remove ps15")
                        cur_line_pos.remove('ps15')
                        print(cur_line_pos)
                        df.at[index, status_col] = ";".join(cur_line_pos)
                elif pdil_pos[0] in ('ps7', 'ps10'):
                    if ndil_pos[0] in ('ps1', 'ps4', 'ps6', 'ps16'):
                        print("remove ps6")
                        cur_line_pos.remove('ps6')
                        df.at[index, status_col] = ";".join(cur_line_pos)
        print("\n")

        # Diagnostics only: nothing in this block changes the frame.
        if 'ps1' in padded and df.at[index, 'lcp'] < 60:
            print(pnbl_pos)
            print(nnbl_pos)
            print("pssible slug",data)
            print(top1,top2,top3,top4,top5)
            if "".join(pnbl_pos) in ('ps6', 'ps15', 'ps16', 'ps17') and "".join(nnbl_pos) == 'ps4':
                print("line is ps1")

        cur_line_pos = df.at[index, status_col].split(";")
        if "".join(nnbl_pos) == 'ps6' and df.at[index, 'nlb'] == 'N':
            # NOTE(review): a row whose only candidate is ps6 ends up with an
            # empty status here, as it did before - confirm this is intended.
            line_new_pos = [ps for ps in cur_line_pos if ps != 'ps6']
            print(line_new_pos)
            df.at[index, status_col] = ";".join(line_new_pos)
        print("\n")

        next_starts_block = (
            (nnbl_pos[0] == 'ps4' and top1 != 'ps1' and top2 != 'ps1')
            or nnbl_pos[0] in ('ps1', 'ps7')
        )
        if next_starts_block:
            print(pnbl_pos)
            if (has_prev and (pnbl_pos[0] == 'ps5' or pnbl_top2 == 'ps5')
                    and cur_indent == pnbl_indent):
                _show(line_no, data, "identifying as PS6")
                df.at[index, status_col] = 'ps6'
                df.at[index, 'When_Identified'] = 'StartTopIdentificationPart2'
                continue

        cur_line_pos = df.at[index, status_col].split(";")
        if len(cur_line_pos) == 1:
            continue
        if (cur_line_pos[0] in ('ps5', 'ps6') and cur_line_pos[1] in ('ps5', 'ps6')
                and df.at[index, 'nlb'] == 'Y' and "".join(nnbl_pos) == 'ps6'
                and df.at[index, 'plb'] == 'N'):
            # NOTE(review): the source had lost its indentation; the ``else``
            # is assumed to pair with the indent comparison - confirm.
            if cur_indent == pnbl_indent:
                _show(line_no, data, "identifying as ps6")
                df.at[index, status_col] = 'ps6'
            else:
                # remove ps5
                line_new_pos = [ps for ps in cur_line_pos if ps != 'ps5']
                _show(line_no, data, "removed ps5")
                df.at[index, status_col] = ";".join(line_new_pos)
    return df
def start_slug_identification(df):
    """Identify a line as ``ps1`` when it directly follows a ``ps16`` line.

    For every row except the first and the last that is not flagged
    ``isIdentified == 'Yes'``: if the previous non-blank line is plain
    ``ps16``, the next non-blank line has no ``ps1`` candidate, the current
    row has both ``ps1`` and ``ps18`` as candidates and its ``ps1`` weight
    exceeds its ``ps18`` weight, the row is set to ``ps1``.

    Fixes relative to the previous revision: writes used chained indexing and
    now go through ``df.at``; a misspelt variable left the previous-line
    candidates empty for the second row; look-ups whose results were never
    used were removed.

    Args:
        df: frame with the columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``data``, ``line_no``, ``When_Identified`` and
            the weight columns ``ps1`` and ``ps18`` (read through ``int()``).
            NOTE(review): assumes a consecutive integer index - confirm.

    Returns:
        The same frame, modified in place.
    """
    status_col = 'Identification_Status'
    final_index = df.index[-1]

    # loop through to examine slug
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue

        # Previous non-blank line: one row up when plb is 'N', else two.
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[index - 1, status_col].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.at[index - 2, status_col].split(";")

        # Next non-blank line, mirrored.
        if index == final_index:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, status_col].split(";")
        elif index + 1 == final_index:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, status_col].split(";")

        if "".join(pnbl_pos) != 'ps16' or 'ps1' in nnbl_pos:
            continue
        print(nnbl_pos)

        cur_line_pos = df.at[index, status_col].split(";")
        if 'ps1' not in cur_line_pos or 'ps18' not in cur_line_pos:
            continue
        wt1 = int(df.at[index, 'ps1'])
        wt18 = int(df.at[index, 'ps18'])
        if wt1 > wt18:
            print("identifying current as ps1 ")
            try:
                print(df.at[index, 'line_no'], df.at[index, 'data'])
            except UnicodeError:
                # Presumably the console cannot always encode the script text.
                pass
            df.at[index, status_col] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingSlug'
    return df
def start_top_identifications_part3(df):
    """Identify ``ps1`` rows that sit between a ``ps16`` and a ``ps4`` line.

    For every row that is not flagged ``isIdentified == 'Yes'`` and has a
    status: when its first candidate is ``ps1``, the previous non-blank line
    has ``ps16`` among its first five candidates and the next non-blank line
    has ``ps4`` among its first three, the row is set to plain ``ps1``.

    Fixes relative to the previous revision: writes used chained indexing and
    now go through ``df.at``; a misspelt variable left the previous-line
    candidates empty for the second row (it was patched up later with a
    length check); the fifth candidate of the previous line is now considered
    whenever it exists (previously only when there were exactly five);
    look-ups whose results were never used were removed.

    Args:
        df: frame with the columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``data``, ``line_no`` and ``When_Identified``.
            NOTE(review): assumes a consecutive integer index starting at 0.

    Returns:
        The same frame, modified in place.
    """
    status_col = 'Identification_Status'
    final_index = df.index[-1]

    def _show(*values):
        # Presumably guards against consoles that cannot encode the text.
        try:
            print(*values)
        except UnicodeError:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        # Previous non-blank line: one row up when plb is 'N', else two.
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[index - 1, status_col].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.at[index - 2, status_col].split(";")

        # Next non-blank line, mirrored.
        if index == final_index:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, status_col].split(";")
        elif index + 1 == final_index:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, status_col].split(";")

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        print("\n")

        if len(pnbl_pos) >= 4:
            print(pnbl_pos[3])
        if status.split(";")[0] != 'ps1':
            continue

        _show("possible slug", data)
        print(pnbl_pos)
        print(pnbl_pos[3] if len(pnbl_pos) >= 4 else pnbl_pos[0])
        print(nnbl_pos)
        if 'ps16' in pnbl_pos[:5] and 'ps4' in nnbl_pos[:3]:
            print("identifying current as ps1 as between top transitiona and action")
            _show(line_no, data)
            df.at[index, status_col] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart3'
    return df
def start_top_identifications_part4(df):
    """Fourth context pass over rows whose element type is still ambiguous.

    ``Identification_Status`` holds ``;``-separated candidate codes
    (``psN``), best candidate first.  For every row that is not yet marked
    identified, the nearest non-blank neighbours (located through the
    ``plb`` / ``nlb`` blank-line flags) are inspected and two rules applied:

    * top candidate ``ps4``, previous line ``ps15``, next line ``ps6``, case
      is not ``AllUpper``, same indent as the next line and no blank line
      after the current one  ->  ``ps4``;
    * ``ps16`` among the top three candidates, previous line ``ps15`` or
      ``ps6``, next line ``ps1``, blank line on both sides and the top
      candidate is not ``ps6``  ->  ``ps16``.

    Afterwards the last non-blank row is collapsed to its top candidate when
    that candidate is ``ps6`` or ``ps15`` and ``isIdentified`` is refreshed
    from the number of remaining candidates.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``ssc``, ``case``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    stage = 'StartIdentifyingTopsPart4'

    def neighbour(index, blank_flag, step):
        # Nearest non-blank row in direction `step` as (label, candidates);
        # (None, ['blank']) when the edge of the script is reached.
        edge = 0 if step < 0 else df.index[-1]
        if index == edge:
            return None, ['blank']
        skip_blank = df.at[index, blank_flag] != 'N'
        if skip_blank and index + step == edge:
            return None, ['blank']
        label = index + (2 * step if skip_blank else step)
        value = df.at[label, 'Identification_Status']
        return label, (value.split(";") if isinstance(value, str) else ['blank'])

    def identify(index, code, when, message):
        # Log the decision, then record it with a single-step .at write
        # (chained `df[col][idx] = ...` is not guaranteed to reach `df`).
        print(message)
        try:
            print(df.at[index, 'line_no'], df.at[index, 'data'])
        except (UnicodeError, OSError):
            pass  # console cannot render the script text; logging is best effort
        df.at[index, 'Identification_Status'] = code
        df.at[index, 'When_Identified'] = when

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        _, pnbl_pos = neighbour(index, 'plb', -1)
        nnbl_index, nnbl_pos = neighbour(index, 'nlb', 1)
        prev_status = ";".join(pnbl_pos)
        next_status = ";".join(nnbl_pos)
        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        ## between 15 and 6 , top 4, nlb=N
        if (top1 == 'ps4' and prev_status == 'ps15' and next_status == 'ps6'
                and df.at[index, 'case'] != 'AllUpper'):
            nnbl_indent = df.at[nnbl_index, 'ssc'] if nnbl_index is not None else -1
            if df.at[index, 'ssc'] == nnbl_indent and nlb == 'N':
                identify(index, 'ps4', stage,
                         "identifying current as ps4 as between dialogue and action end and top action begin")
                continue

        ## between 15,6 and 1 , top 3 has 16, nlb=Y , plb =Y
        if ('ps16' in cur_line_pos[:3] and prev_status in ('ps15', 'ps6')
                and next_status == 'ps1'):
            if plb == 'Y' and nlb == 'Y' and top1 != 'ps6':
                identify(index, 'ps16', stage, "identifying current as transition ")
                continue

    # Collapse the last non-blank line to its top candidate when that is ps6/ps15.
    last_line_index = df.index[-1]
    if df.at[last_line_index, 'Identification_Status'] == 'blank':
        last_line_index -= 1
    if last_line_index in df.index:
        last_status = df.at[last_line_index, 'Identification_Status']
        if isinstance(last_status, str):
            last_pos = last_status.split(";")
            if len(last_pos) > 1 and last_pos[0] in ('ps6', 'ps15'):
                identify(last_line_index, last_pos[0], 'IdentifyingLastLine',
                         "Identifying last line as top " + last_pos[0])

    # Refresh the identified flag (the original used `==`, which assigned nothing).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
def start_top_identifications_part5(df):
    """Fifth context pass: identify ``ps5`` / ``ps1`` rows and prune candidates.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    first.  For every row not yet marked identified:

    * previous line is exactly ``ps5`` or ``ps4``, top candidate is ``ps5``
      and the indent equals the next line's indent: the row becomes ``ps5``
      when the next line has ``ps6`` in its top two (and its top candidate is
      neither ``ps1`` nor ``ps2``) or when the next line's top is ``ps5``;
    * previous line is exactly ``ps16``, ``ps1`` is in the row's top two and
      the next line has ``ps4`` or ``ps6`` in its top two: the row becomes
      ``ps1``;
    * otherwise ``ps1``/``ps3`` are dropped from the candidates when the next
      line has no ``ps4`` in its top four and no ``ps6`` in its top two
      (unless the next line's top is ``ps7``/``ps8`` and this row's top is
      ``ps1``/``ps3``), and ``ps2``/``ps18`` are dropped when ``lcp < 68``.

    Finally ``isIdentified`` is refreshed from the number of candidates left.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``ssc``, ``lcp``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    stage = 'StartIdentifyingTopsPart5'

    def neighbour(index, blank_flag, step):
        # Nearest non-blank row in direction `step` as (label, candidates);
        # (None, ['blank']) when the edge of the script is reached.
        edge = 0 if step < 0 else df.index[-1]
        if index == edge:
            return None, ['blank']
        skip_blank = df.at[index, blank_flag] != 'N'
        if skip_blank and index + step == edge:
            return None, ['blank']
        label = index + (2 * step if skip_blank else step)
        value = df.at[label, 'Identification_Status']
        return label, (value.split(";") if isinstance(value, str) else ['blank'])

    def log(index, message):
        print(message)
        try:
            print(df.at[index, 'line_no'], df.at[index, 'data'])
        except (UnicodeError, OSError):
            pass  # console cannot render the script text; logging is best effort

    def identify(index, code, message):
        log(index, message)
        df.at[index, 'Identification_Status'] = code
        df.at[index, 'When_Identified'] = stage

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        _, pnbl_pos = neighbour(index, 'plb', -1)
        nnbl_index, nnbl_pos = neighbour(index, 'nlb', 1)
        prev_status = "".join(pnbl_pos)
        nnbl_top1 = nnbl_pos[0]
        cur_indent = df.at[index, 'ssc']
        nnbl_indent = df.at[nnbl_index, 'ssc'] if nnbl_index is not None else -1

        ## pnbl is ps5 or 4 , cur top 5 , next top 2 has 6 (or next top is 5), same indent as next
        if prev_status in ('ps5', 'ps4') and top1 == 'ps5' and cur_indent == nnbl_indent:
            next_closes_action = 'ps6' in nnbl_pos[:2] and nnbl_top1 not in ('ps1', 'ps2')
            if next_closes_action or nnbl_top1 == 'ps5':
                identify(index, 'ps5', "identifying current as ps5 as between actions")
                continue

        ## pnbl is ps16 , cur top2 has ps1 , next top 2 has 4 or 6
        if prev_status == 'ps16' and 'ps1' in cur_line_pos[:2]:
            if 'ps4' in nnbl_pos[:2] or 'ps6' in nnbl_pos[:2]:
                identify(index, 'ps1', "identifying current as ps1 as between transition and action")
                continue

        ## next has no ps4 in top 4 and no ps6 in top 2: remove 1,3 from current line
        ## dont remove if next line's top is speaker (ps7/ps8) and current top is ps1/ps3
        if 'ps4' not in nnbl_pos[:4] and 'ps6' not in nnbl_pos[:2]:
            if nnbl_top1 in ('ps7', 'ps8') and top1 in ('ps1', 'ps3'):
                print("not removing ps1 as next is speaker and current top is slugline")
            else:
                line_new_pos = [ps for ps in cur_line_pos if ps not in ('ps1', 'ps3')]
                log(index, "Removing ps1 ps3 from current as next does not have ps4 in top4 ps6 in top2")
                df.at[index, 'Identification_Status'] = ";".join(line_new_pos)

        ## remove ps2 and ps18 as a possibility if lcp is small
        # NOTE(review): the log text says 75 but the threshold tested is 68 - confirm which is intended.
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        if df.at[index, 'lcp'] < 68 and ('ps2' in cur_line_pos or 'ps18' in cur_line_pos):
            log(index, "Removing ps2 ps18 from current as lcp < 75")
            line_new_pos = [ps for ps in cur_line_pos if ps not in ('ps2', 'ps18')]
            df.at[index, 'Identification_Status'] = ";".join(line_new_pos)

    # Refresh the identified flag (the original used `==`, which assigned nothing).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
def start_top_identifications_part6(df):
    """Sixth context pass: settle ``ps1`` rows that follow a ``ps17`` line.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    first.  A row that is not yet marked identified becomes ``ps1`` when its
    top candidate is ``ps1``, the previous non-blank line is exactly ``ps17``
    and the next non-blank line has ``ps4`` among its top two candidates.
    Neighbours are located through the ``plb`` / ``nlb`` blank-line flags.
    Finally ``isIdentified`` is refreshed from the number of candidates left.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    stage = 'StartIdentifyingTopsPart6'

    def neighbour(index, blank_flag, step):
        # Nearest non-blank row in direction `step` as (label, candidates);
        # (None, ['blank']) when the edge of the script is reached.
        edge = 0 if step < 0 else df.index[-1]
        if index == edge:
            return None, ['blank']
        skip_blank = df.at[index, blank_flag] != 'N'
        if skip_blank and index + step == edge:
            return None, ['blank']
        label = index + (2 * step if skip_blank else step)
        value = df.at[label, 'Identification_Status']
        return label, (value.split(";") if isinstance(value, str) else ['blank'])

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        top1 = status.split(";")[0]
        _, pnbl_pos = neighbour(index, 'plb', -1)
        _, nnbl_pos = neighbour(index, 'nlb', 1)

        ## top 1 is ps1 pnbl is 17 nnbl has ps4 in top2
        if top1 == 'ps1' and "".join(pnbl_pos) == 'ps17' and 'ps4' in nnbl_pos[:2]:
            print("identifying current as ps1 as between special term and action")
            try:
                print(df.at[index, 'line_no'], df.at[index, 'data'])
            except (UnicodeError, OSError):
                pass  # console cannot render the script text; logging is best effort
            # Single-step .at writes: chained `df[col][idx] = ...` may not reach `df`.
            df.at[index, 'Identification_Status'] = 'ps1'
            df.at[index, 'When_Identified'] = stage

    # Refresh the identified flag (the original used `==`, which assigned
    # nothing, and reused a stale candidate list for non-string statuses).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
def start_top_identifications_part7(df):
    """Seventh context pass: choose ``ps15`` for rows torn between ``ps15`` and ``ps6``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    first.  A row that is not yet marked identified and whose top two
    candidates are ``ps15`` and ``ps6`` (either order) becomes ``ps15`` when
    its indent (``ssc``) is smaller than the indent of the previous non-blank
    line and the line before that one is indented less than the previous
    non-blank line.  Indents are compared numerically.  Finally
    ``isIdentified`` is refreshed from the number of candidates left.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``ssc``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    stage = 'StartIdentifyingTopsPart7'

    def previous_non_blank(index):
        # Label of the nearest non-blank row above `index`, or None at the top.
        if index == 0:
            return None
        skip_blank = df.at[index, 'plb'] != 'N'
        if skip_blank and index - 1 == 0:
            return None
        return index - 2 if skip_blank else index - 1

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        ## top 1 and 2 are (ps6 and ps15)
        if len(cur_line_pos) < 2 or set(cur_line_pos[:2]) != {'ps15', 'ps6'}:
            continue

        pnbl_index = previous_non_blank(index)
        if pnbl_index is None:
            continue
        pnbl_indent = df.at[pnbl_index, 'ssc']
        if not df.at[index, 'ssc'] < pnbl_indent:
            continue

        # Indent of the line before the previous non-blank one; 0 when the
        # script does not reach that far back (the original raised KeyError).
        ppnbl_index = pnbl_index - (1 if df.at[pnbl_index, 'plb'] == 'N' else 2)
        ppnbl_indent = df.at[ppnbl_index, 'ssc'] if ppnbl_index in df.index else 0

        # Numeric comparison: the original compared str() values, which is
        # lexicographic ('5' < '25' is False).
        if ppnbl_indent < pnbl_indent:
            print("identifying current as ps15 as possibly followed by speaker")
            try:
                print(df.at[index, 'line_no'], df.at[index, 'data'])
            except (UnicodeError, OSError):
                pass  # console cannot render the script text; logging is best effort
            # Single-step .at writes: chained `df[col][idx] = ...` may not reach `df`.
            df.at[index, 'Identification_Status'] = 'ps15'
            df.at[index, 'When_Identified'] = stage

    # Refresh the identified flag (the original used `==`, which assigned nothing).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
def start_top_identifications_part8(df):
    """Eighth context pass: split ``ps4`` / ``ps6`` ties that follow a ``ps1`` line.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    first.  For a row that is not yet marked identified, whose top two
    candidates are ``ps4`` and ``ps6`` (either order) and whose previous
    non-blank line is exactly ``ps1``:

    * the row becomes ``ps4`` when the next non-blank line is exactly ``ps6``
      and no blank line follows the current one (``nlb == 'N'``);
    * otherwise it becomes ``ps6`` when the next line has no ``ps5``
      candidate and its top candidate is not ``ps6``.

    Finally ``isIdentified`` is refreshed from the number of candidates left.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    stage = 'StartIdentifyingTopsPart8'

    def neighbour(index, blank_flag, step):
        # Nearest non-blank row in direction `step` as (label, candidates);
        # (None, ['blank']) when the edge of the script is reached.
        edge = 0 if step < 0 else df.index[-1]
        if index == edge:
            return None, ['blank']
        skip_blank = df.at[index, blank_flag] != 'N'
        if skip_blank and index + step == edge:
            return None, ['blank']
        label = index + (2 * step if skip_blank else step)
        value = df.at[label, 'Identification_Status']
        return label, (value.split(";") if isinstance(value, str) else ['blank'])

    def identify(index, code, message):
        print(message)
        try:
            print(df.at[index, 'line_no'], df.at[index, 'data'])
        except (UnicodeError, OSError):
            pass  # console cannot render the script text; logging is best effort
        # Single-step .at writes: chained `df[col][idx] = ...` may not reach `df`.
        df.at[index, 'Identification_Status'] = code
        df.at[index, 'When_Identified'] = stage

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        ## top 1 and 2 are (ps4 and ps6)
        if len(cur_line_pos) < 2 or set(cur_line_pos[:2]) != {'ps4', 'ps6'}:
            continue

        _, pnbl_pos = neighbour(index, 'plb', -1)
        if "".join(pnbl_pos) != 'ps1':
            continue

        _, nnbl_pos = neighbour(index, 'nlb', 1)
        if "".join(nnbl_pos) == 'ps6' and df.at[index, 'nlb'] == 'N':
            identify(index, 'ps4', "identifying current as ps4 ")
        elif 'ps5' not in nnbl_pos and nnbl_pos[0] != 'ps6':
            identify(index, 'ps6', "identifying current as ps6 as possibly between slug and speaker")

    # Refresh the identified flag (the original used `==`, which assigned nothing).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
#1.1
def decrease_wt_dial_between_action(df):
    """Re-weight ``ps14`` / ``ps5`` for rows that follow a ``ps4`` line and re-rank.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    first, and each code has a weight column of the same name.  For every row
    not yet marked identified whose previous non-blank line has ``ps4`` as
    top candidate:

    * when ``ps14`` is among the row's top two candidates, the ``ps14``
      weight is decreased by 5;
    * when the next non-blank line's top candidate is ``ps6``, the ``ps5``
      weight is increased by 11.

    Rows whose weights changed get their candidates re-sorted by descending
    weight; the result is written to ``Identification_Status`` and, as
    ``code-weight`` pairs, to ``Identification_Status_with_weights``.
    Candidates without a weight column are dropped from the re-ranked list
    (as in the original).  Finally ``isIdentified`` is refreshed.

    Args:
        df: DataFrame indexed 0..n-1 with the columns ``isIdentified``,
            ``Identification_Status``, ``Identification_Status_with_weights``,
            ``plb``, ``nlb`` and one weight column per candidate code.

    Returns:
        The same DataFrame, modified in place.
    """
    def neighbour(index, blank_flag, step):
        # Candidates of the nearest non-blank row in direction `step`;
        # ['blank'] when the edge of the script is reached.
        edge = 0 if step < 0 else df.index[-1]
        if index == edge:
            return ['blank']
        skip_blank = df.at[index, blank_flag] != 'N'
        if skip_blank and index + step == edge:
            return ['blank']
        value = df.at[index + (2 * step if skip_blank else step), 'Identification_Status']
        return value.split(";") if isinstance(value, str) else ['blank']

    def sort_weight(pair):
        # Unparseable weights sort last instead of aborting the whole pass.
        return pair[1] if isinstance(pair[1], int) else float('-inf')

    if df.empty:
        return df

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        pnbl_pos = neighbour(index, 'plb', -1)
        nnbl_pos = neighbour(index, 'nlb', 1)
        print("checking dialogue between action", index, pnbl_pos, cur_line_pos, nnbl_pos)

        wt_changed = False
        ## previous top is ps4 and ps14 is in current top 2: decrease weight of ps14 by 5
        if pnbl_pos[0] == 'ps4' and 'ps14' in cur_line_pos[:2]:
            # Weights are written back as strings, as the original did.
            df.at[index, 'ps14'] = str(int(df.at[index, 'ps14']) - 5)
            wt_changed = True
        ## previous top is ps4 and next top is ps6: increase weight of ps5 by 11
        if pnbl_pos[0] == 'ps4' and nnbl_pos[0] == 'ps6':
            df.at[index, 'ps5'] = str(int(df.at[index, 'ps5']) + 11)
            wt_changed = True
        if not wt_changed:
            continue

        # Pair every candidate with its weight.  Keeping (code, weight) tuples
        # avoids re-parsing "code-weight" strings, which broke on negative
        # weights ("ps14--3".split("-")[1] is '').
        weighted = []
        for pos in cur_line_pos:
            try:
                raw = df.at[index, pos]
            except KeyError:
                continue  # candidate has no weight column
            try:
                weighted.append((pos, int(raw)))
            except (TypeError, ValueError):
                weighted.append((pos, raw))

        # sorted() is stable, so equal weights keep their previous order.
        weighted = sorted(weighted, key=sort_weight, reverse=True)
        df.at[index, 'Identification_Status_with_weights'] = ';'.join(
            pos + '-' + str(wt) for pos, wt in weighted)
        line_pos_string = ';'.join(pos for pos, _ in weighted)
        print(line_pos_string)
        df.at[index, 'Identification_Status'] = line_pos_string

    # Refresh the identified flag (the original used `==`, which assigned nothing).
    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if isinstance(status, str):
            df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'
    return df
def examine_among_two(df):
# Tie-break pass over rows that are not yet marked identified.  Most rules
# fire when exactly two candidates remain in the ';'-separated
# Identification_Status string; a matching rule rewrites
# Identification_Status and stamps When_Identified ('ExamineLastTwo',
# 'ExamineSpeakerSkipWords' or 'ExamineSluglineEnd').  One rule also edits
# the 'data' text of the current and the next non-blank row.
# Reads the columns isIdentified, Identification_Status, plb, nlb, ssc,
# case, parenthetical, lcp, line_no, data, ps1 and ps17.  Returns df.
for index in df.index:
if df['isIdentified'][index] == 'Yes' or pd.isna(df['Identification_Status'][index]):
continue
pnbl_pos = []
nnbl_pos = []
pnbl_index = -1
prev_flag = False
next_flag = False
pnbl_index = index -1
nnbl_index = index +1
# Previous non-blank line: the row directly above when plb == 'N',
# otherwise two rows up; 'first' marks "no such row".
if index == 0:
pnbl_pos = ['blank']
pnbl_index = 'first'
elif df['plb'][index] == 'N' :
print(pnbl_pos)
pnbl_pos = df['Identification_Status'][index-1].split(";")
pnbl_index = index -1
elif index - 1 == 0:
# NOTE(review): 'pnpl_pos' looks like a typo for 'pnbl_pos'; the name is
# never read.  pnbl_pos stays [] here and is replaced by ['blank'] at the
# len(pnbl_pos) == 0 check further down.
pnpl_pos = ['blank']
pnbl_index = 'first'
else:
pnbl_pos = df['Identification_Status'][index-2].split(";")
pnbl_index = index -2
# Next non-blank line, resolved the same way through nlb; 'last' marks
# "no such row".
if index == df.index[-1]:
nnbl_pos = ['blank']
nnbl_index = 'last'
elif df['nlb'][index] == 'N' :
nnbl_pos = df['Identification_Status'][index+1].split(";")
nnbl_par = df['parenthetical'][index+1]
nnbl_index = index + 1
elif index+1 == df.index[-1]:
nnbl_pos = ['blank']
nnbl_index = 'last'
else:
nnbl_pos = df['Identification_Status'][index+2].split(";")
nnbl_par = df['parenthetical'][index+2]
nnbl_index = index + 2
cur_indent = df['ssc'][index]
# The lookups below fail for the 'first' / 'last' sentinels; the bare
# excepts then fall back to an indent of -1.
try:
pnbl_indent = df['ssc'][pnbl_index]
except:
pnbl_indent = -1
try:
nnbl_indent = df['ssc'][nnbl_index]
nnbl_case = df['case'][nnbl_index]
except:
# NOTE(review): nnbl_case is not reset here, so on this path it keeps the
# value from an earlier iteration (or is undefined on the first one) while
# later code reads it - confirm whether a default is needed.
nnbl_indent = -1
# try:
# if df['prvious_line_blank'][pnbl_index] == 'N' :
# ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";")
# else:
# ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";")
# except:
# pass
line_no = df['line_no'][index]
data = df['data'][index]
cur_line_pos = df['Identification_Status'][index].split(";")
pnbl_par = df['parenthetical'][pnbl_index] if pnbl_index != 'first' else False
cur_line_par = df['parenthetical'][index]
cur_line_case = df['case'][index]
try:
pnbl_top2 = pnbl_pos[1]
except:
pnbl_top2 = ''
# try:
# nnbl_top2 = nnbl_pos[1]
# except:
# nnbl_top2 = ''
#cur_indent = df['data_begins/Space count'][index]
#print(cur_indent)
#print("examining")
#print(line_no,cur_indent,data)
#print(cur_line_pos)
# print(pnbl_pos)
# print(cur_line_pos)
# print(nnbl_pos)
line_new_pos = []
print("\n")
# top1..top5: the current row's candidates by rank; missing ranks are
# padded with the top candidate.
cur_line_pos = df['Identification_Status'][index].split(";")
top1 = cur_line_pos[0]
top2 = top1
top3 = top1
top4 = top1
top5 = top1
if len(cur_line_pos) == 5:
top5 = cur_line_pos[4]
if len(cur_line_pos) >= 4:
top4 = cur_line_pos[3]
if len(cur_line_pos) >= 3:
top3 = cur_line_pos[2]
if len(cur_line_pos) >= 2:
top2 = cur_line_pos[1]
lcp = df['lcp'][index]
# Same padding scheme for the previous non-blank line's candidates.
if len(pnbl_pos) == 0:
pnbl_pos = ['blank']
pnbl_top1 = pnbl_pos[0]
pnbl_top2 = pnbl_top1
pnbl_top3 = pnbl_top1
pnbl_top4 = pnbl_top1
pnbl_top5 = pnbl_top1
if len(pnbl_pos) == 5:
pnbl_top5 = pnbl_pos[4]
if len(pnbl_pos) >= 4:
pnbl_top4 = pnbl_pos[3]
print(pnbl_pos[3])
if len(pnbl_pos) >= 3:
pnbl_top3 = pnbl_pos[2]
if len(pnbl_pos) >= 2:
pnbl_top2 = pnbl_pos[1]
# lcp = df['last_character_placement'][index]
# ...and for the next non-blank line's candidates (top three only).
nnbl_top1 = nnbl_pos[0]
nnbl_top2 = nnbl_top1
nnbl_top3 = nnbl_top1
# nnbl_top4 = nnbl_top1
# nnbl_top5 = nnbl_top1
if len(nnbl_pos) >= 3:
nnbl_top3 = nnbl_pos[2]
if len(nnbl_pos) >= 2:
nnbl_top2 = nnbl_pos[1]
try:
print(index,data,cur_line_case,top1,top2,nnbl_case,nnbl_top1)
except:
pass
nnbl_new_data = ''
before = ''
## if 6 and 22 left
## split after full stop to new line.. merge with subsequent transition
# NOTE(review): 'and' binds tighter than 'or', so the second parenthesised
# group is tested without the len(cur_line_pos) == 2 guard, and each group
# accepts either code alone rather than the ps6/ps22 pair - confirm intent.
if len(cur_line_pos) == 2 and (top1 == 'ps6' or top2 == 'ps22') or (top2 == 'ps6' or top1 == 'ps22'):
# NOTE(review): the pattern "." is an unescaped regex dot, which matches any
# character rather than a literal full stop - presumably r"\." was meant.
if re.search(".",data):
print("found full stop,separating")
parts = data.split(".")
last = parts[-1]
try:
print(last)
except:
pass
# NOTE(review): split(".") discards the full stops and the pieces are later
# re-joined with spaces, so the rewritten 'data' loses its full stops.
before = parts[0:-1]
print(" ".join(before))
print(nnbl_indent)
if len(last.split()) == 1:
print("single word after full stop")
if "".join(nnbl_pos) == 'ps16':
print("next is transition , merging")
nnbl_data = df['data'][nnbl_index]
try:
print(nnbl_data)
except:
pass
nnbl_new_data = last.strip() + ' ' + nnbl_data.strip()
try:
print(nnbl_new_data)
except:
pass
# Left-pad the merged text with nnbl_indent spaces.
nnbl_new_data = nnbl_new_data.rjust(len(nnbl_new_data) + int(nnbl_indent))
df['data'][nnbl_index] = nnbl_new_data
print("Splitting current and Identifying current action end")
df['data'][index] = " ".join(before)
df['Identification_Status'][index] = 'ps6'
df['When_Identified'][index] = 'ExamineLastTwo'
continue
## if 7 and 8.. make 7 if no parenthetical
if len(cur_line_pos) == 2 and ((top1 == 'ps7' and top2 == 'ps8') or (top1 == 'ps8' and top2 == 'ps7')):
if df['parenthetical'][index] == 'Absent':
try:
print("Identifying as speaker as no parenthtical",data)
except:
pass
df['Identification_Status'][index] = 'ps7'
df['When_Identified'][index] = 'ExamineLastTwo'
continue
## if 1/2 and 30 left: keep 1 if lcp <= 63; otherwise, when this line and
## the next are both AllUpper and the next top is ps1/ps3, mark this line
## ps2 and the next line ps3
if len(cur_line_pos) == 2 and (( (top1 == 'ps1' or top1 == 'ps2') and top2 == 'ps30') or (top1 == 'ps30' and top2 == 'ps1')):
print(index,cur_line_case,nnbl_case,nnbl_top1)
if df['lcp'][index] <= 63:
try:
print("Identifying as slugline",data)
except:
pass
df['Identification_Status'][index] = 'ps1'
df['When_Identified'][index] = 'ExamineLastTwo'
elif cur_line_case == 'AllUpper' and nnbl_case == 'AllUpper' and (nnbl_top1 == 'ps1' or nnbl_top1 == 'ps3'):
try:
print("Identifying as slugline beginning",data)
except:
pass
df['Identification_Status'][index] = 'ps2'
df['When_Identified'][index] = 'ExamineLastTwo'
try:
print("Identifying as slugline end",df['data'][nnbl_index])
except:
pass
df['Identification_Status'][nnbl_index] = 'ps3'
df['When_Identified'][nnbl_index] = 'ExamineLastTwo'
continue
## if 15 and 29 left keep 15 if lcp <= 51
if len(cur_line_pos) == 2 and ((top1 == 'ps15' and top2 == 'ps29') or (top1 == 'ps29' and top2 == 'ps15')):
if df['lcp'][index] <= 51:
try:
print("Identifying as dialogue ending",data)
except:
pass
df['Identification_Status'][index] = 'ps15'
df['When_Identified'][index] = 'ExamineLastTwo'
continue
## if 13 and 9 left keep 13 when the line has no parenthetical
if len(cur_line_pos) == 2 and ((top1 == 'ps13' and top2 == 'ps9') or (top1 == 'ps9' and top2 == 'ps13')) :
if cur_line_par == 'Absent':
try:
print("Identifying as dialogue beginning",data)
except:
pass
df['Identification_Status'][index] = 'ps13'
df['When_Identified'][index] = 'ExamineLastTwo'
continue
if len(cur_line_pos) == 2:
## ps1 first and ps17 second: keep ps1 when its weight leads by more than 20
if cur_line_pos[0] == 'ps1' and cur_line_pos[1] == 'ps17':
wt1 = int(df['ps1'][index])
wt17 = int(df['ps17'][index])
if wt1 - wt17 > 20:
print("identifying current as ps1 ")
try:
print(line_no,data)
except:
pass
df['Identification_Status'][index] = 'ps1'
df['When_Identified'][index] = 'ExamineLastTwo'
continue
### remove ps7 ,8 if in stopwords
elif cur_line_pos[0] == 'ps7':
# line_new_pos is an alias of cur_line_pos, not a copy: remove() below
# mutates both names.
line_new_pos = cur_line_pos
print("Checking stop words")
skip_words = ['ON THE SCREEN','ON THE TV','MORNING','AT HOTEL','TV','MONTAGES','MUSICAL MONTAGES','ESSENTIALS','LATER','ESSENTIAL']
search_data = data.replace(":","")
found_match = False
# re.match anchors only at the start, so any line that merely begins with
# a skip word counts as a match.
for word in skip_words:
if re.match(word,search_data.strip()):
found_match = True
break
if found_match:
try:
line_new_pos.remove('ps7')
line_new_pos.remove('ps8')
print("ps7,ps8 removed")
df['Identification_Status'][index] = ";".join(line_new_pos)
df['When_Identified'][index] = 'ExamineSpeakerSkipWords'
continue
except:
# Reached when 'ps8' is not among the candidates (list.remove raises
# ValueError); 'ps7' has already been removed from the local list by then,
# but the DataFrame is left untouched.
print("Could not remove speaker pos")
### remove ps3 if pnbl top 2 does not have ps2
else:
line_new_pos = cur_line_pos
print("Checking sluglineend")
if not (pnbl_top1 == 'ps2' or pnbl_top2 == 'ps2') and cur_line_pos[0] == 'ps3':
line_new_pos.remove('ps3')
print("ps3 removed")
df['Identification_Status'][index] = ";".join(line_new_pos)
df['When_Identified'][index] = 'ExamineSluglineEnd'
continue
# NOTE(review): the two statements below use '==' (a comparison whose result
# is discarded), so isIdentified is never updated here - presumably '=' was
# intended; confirm before changing, since later passes key off this flag.
for index in df.index:
#print(index)
cur_line_pos = df['Identification_Status'][index].split(";")
if len(cur_line_pos) != 1 :
df['isIdentified'][index] == 'No'
else:
df['isIdentified'][index] == 'Yes'
return df
def examine_action_using_top2_wt_diff(df):
    """Settle unidentified ps6/ps5 lines whose top candidate clearly outweighs the runner-up.

    Every row except the first and the last is visited.  Rows already marked
    ``isIdentified == 'Yes'`` are skipped.  For the others, the gap between
    the weights of the two leading candidates in ``Identification_Status``
    (weights are read from the columns named after the candidates) is
    combined with indentation, case and neighbouring-line checks; when the
    checks pass, the candidate list is collapsed to its first entry and
    ``When_Identified`` is set to ``'ExamineActionUsingTop2Wt'``.

    Args:
        df: DataFrame with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``When_Identified``, ``ssc``, ``case``
            and one weight column per candidate label.  The ``index - 1`` /
            ``index + 2`` arithmetic assumes consecutive integer row labels
            -- TODO confirm against callers.

    Returns:
        The same DataFrame object, updated in place.
    """

    def indent_and_case(label):
        # A neighbour that does not exist is reported as indent -1 / no case.
        try:
            return df.loc[label, 'ssc'], df.loc[label, 'case']
        except KeyError:
            return -1, ''

    last_label = df.index[-1]
    for index in df.index[1:-1]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        # Previous non-blank line: the direct predecessor when plb is 'N',
        # otherwise the line two rows up (skipping the blank in between).
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
        elif df.loc[index, 'plb'] == 'N':
            pnbl_pos = df.loc[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.loc[index - 2, 'Identification_Status'].split(";")
            pnbl_index = index - 2

        # Next non-blank line, located the same way in the other direction.
        nnbl_index = index + 1
        if index == last_label:
            nnbl_pos = ['blank']
        elif df.loc[index, 'nlb'] == 'N':
            nnbl_pos = df.loc[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == last_label:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.loc[index + 2, 'Identification_Status'].split(";")
            nnbl_index = index + 2

        cur_indent = df.loc[index, 'ssc']
        cur_case = df.loc[index, 'case']
        pnbl_indent, pnbl_case = indent_and_case(pnbl_index)
        nnbl_indent, nnbl_case = indent_and_case(nnbl_index)

        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        top1 = cur_line_pos[0]
        # With a single candidate the runner-up is the candidate itself,
        # which makes the weight difference zero.
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        top2_wt_diff = df.loc[index, top1] - df.loc[index, top2]

        pnbl_label = "".join(pnbl_pos)
        nnbl_label = "".join(nnbl_pos)

        ## if top is 6
        if top1 == 'ps6':
            print("top 2 wt diff",top2_wt_diff)
            if cur_indent < 25 and nnbl_label == 'ps1' and top2_wt_diff > 15:
                print("identifying as ps6")
                # .loc instead of df[col][index]: chained assignment may
                # write to a temporary copy and leave df untouched.
                df.loc[index, 'Identification_Status'] = 'ps6'
                df.loc[index, 'When_Identified'] = 'ExamineActionUsingTop2Wt'

        ## if top is 5
        if top1 == 'ps5' and cur_case != 'AllUpper':
            print("top 2 wt diff",top2_wt_diff)
            same_indent = pnbl_indent == cur_indent and cur_indent == nnbl_indent
            neighbours_fit = (pnbl_label in ('ps4', 'ps5')
                              or nnbl_label in ('ps6', 'ps5'))
            if (same_indent and neighbours_fit and top2_wt_diff > 10
                    and pnbl_case != 'AllUpper' and nnbl_case != 'AllUpper'):
                print("identifying as ps5")
                df.loc[index, 'Identification_Status'] = 'ps5'
                df.loc[index, 'When_Identified'] = 'ExamineActionUsingTop2Wt'
    return df
def identify_top_as_final(df):
    """Collapse every still-unidentified line to a single final label.

    For each row whose ``isIdentified`` is not ``'Yes'`` the first candidate
    in ``Identification_Status`` becomes the final label.  The one exception:
    when that first candidate is ``ps1`` or ``ps2`` but none of the slugline
    markers (``INT.``, ``EXT.``, ``I/E``, ``E/I``, ``EXT-``, ``INT-``) occurs
    in the first eight characters of the stripped text, the second candidate
    is used instead (or the first again when there is only one).

    Args:
        df: DataFrame with the columns ``Identification_Status``
            (``;``-separated candidates), ``isIdentified`` and ``data``.

    Returns:
        The same DataFrame object, updated in place.
    """
    slug_markers = ('INT.', 'EXT.', 'I/E', 'E/I', 'EXT-', 'INT-')
    for index in df.index:
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1

        # Literal substring test: used as a regex, the '.' in 'INT.'/'EXT.'
        # matches any character, so words like "INTO" or "EXTRA" would be
        # mistaken for slugline markers.
        head = df.loc[index, 'data'].strip()[0:8]
        contains_slug_words = any(marker in head for marker in slug_markers)

        if top1 in ('ps1', 'ps2') and not contains_slug_words:
            # .loc instead of df[col][index]: chained assignment may write
            # to a temporary copy and leave df untouched.
            df.loc[index, 'Identification_Status'] = top2
            continue
        df.loc[index, 'Identification_Status'] = top1
    return df
def run_audit_on_identified_backup(df,audit_df):
    """Apply the per-label layout rules to every identified line and log the changes.

    Each non-blank row of ``df`` is dispatched on its ``Identification_Status``
    label (ps1, ps4-ps7, ps10-ps17, ps20) to a rule set that fixes the left or
    right indent, the letter case, and whether a blank line must exist before
    and after the line.  Blank rows are inserted under fractional index labels
    (``index +/- 0.25``) and removed with ``drop``; the frame is sorted and
    re-indexed at the end.  Labels without a rule set are left untouched.

    Auditing is best-effort: if a rule set fails part-way (for instance
    because a neighbouring row was already removed), the changes made so far
    are kept, the error is printed and processing continues with the next row.

    Args:
        df: line table with the columns ``line_no``, ``data``,
            ``Identification_Status``, ``case``, ``ssc``, ``lcp``, ``plb`` and
            ``nlb`` (``pnbl_line_no`` / ``isIdentified`` are read for ps6).
            Mutated in place while auditing.  Neighbour lookups use
            ``index - 1`` / ``index + 1`` and so assume consecutive integer
            row labels -- TODO confirm against callers.
        audit_df: change log indexed by ``line_no``; the ``*_corrected``,
            ``blank_*`` and ``line_removed`` columns are written.

    Returns:
        A new DataFrame: ``df`` sorted by index with a fresh 0..n-1 index.
    """

    # ------------------------------------------------------------------
    # Primitive corrections.  All writes go through .loc[row, col];
    # df[col][row] = ... is chained assignment and may hit a temporary copy.
    # ------------------------------------------------------------------
    def correct_case(index, new_case):
        line_no = df.loc[index, 'line_no']
        print("correcting case to",new_case)
        if new_case == 'AllUpper':
            df.loc[index, 'data'] = df.loc[index, 'data'].upper()
        elif new_case == 'AllLower':
            df.loc[index, 'data'] = df.loc[index, 'data'].lower()
        df.loc[index, 'case'] = new_case
        audit_df.loc[line_no, 'case_corrected'] = 'Corrected to ' + str(new_case)

    def correct_left_indent(index, new_indent):
        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data'].strip()
        print("Correcting left indent to",new_indent)
        df.loc[index, 'data'] = data.rjust(len(data) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_indent + len(data) - 1
        audit_df.loc[line_no, 'left_indent_corrected'] = 'Left indent Corrected to ' + str(new_indent)

    def correct_right_indent(index, new_lcp):
        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data'].strip()
        print("Correcting right indent to",83 - new_lcp -1)
        # Pad on the left so that the last character lands on column new_lcp.
        new_indent = new_lcp - len(data) + 1
        df.loc[index, 'data'] = data.rjust(len(data) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_lcp
        audit_df.loc[line_no, 'right_indent_corrected'] = 'Right indent Corrected to ' + str(83 - new_lcp -1)

    def delete_line_after(index):
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index + 1, 'line_no']
        df.drop(index + 1, inplace=True)
        print("line deleted after",line_no)
        print("line no deleted ",removed_line_no)
        audit_df.loc[line_no, 'blank_deleted_after'] = 'Yes'
        audit_df.loc[removed_line_no, 'line_removed'] = 'Yes'

    def delete_line_before(index):
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index - 1, 'line_no']
        df.drop(index - 1, inplace=True)
        print("line deleted before",line_no)
        audit_df.loc[line_no, 'blank_deleted_before'] = 'Yes'
        audit_df.loc[removed_line_no, 'line_removed'] = 'Yes'

    def set_existing_audit_cells(line_no, cells):
        # Only touch columns the log already has; never grow the log sideways.
        for column, value in cells:
            if column in audit_df.columns:
                audit_df.loc[line_no, column] = value

    def insert_line_after(index):
        line_no = df.loc[index, 'line_no']
        next_line_no = df.loc[index + 1, 'line_no']
        # The new line number sits between its neighbours; step again if taken.
        new_line_no = (line_no + next_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + next_line_no)/2
        print("inserted blank line after ", line_no)
        new_label = index + 0.25
        df.loc[new_label] = np.nan
        df.loc[new_label, 'data'] = ''
        df.loc[new_label, 'Identification_Status'] = 'blank'
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = 'N'
        df.loc[new_label, 'line_no'] = new_line_no
        df.loc[index + 1, 'plb'] = 'Y'
        audit_df.loc[line_no, 'blank_inserted_after'] = 'Yes'
        audit_df.loc[new_line_no] = 'No'
        set_existing_audit_cells(new_line_no, (('data', ''),
                                               ('data_corrected', ''),
                                               ('line_removed', 'No')))
        print("line inserted after ",line_no)

    def insert_line_before(index):
        line_no = df.loc[index, 'line_no']
        pvs_line_no = df.loc[index - 1, 'line_no']
        new_line_no = (line_no + pvs_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + line_no)/2
        print("inserted blank line before",line_no)
        new_label = index - 0.25
        df.loc[new_label] = np.nan
        df.loc[new_label, 'data'] = ''
        df.loc[new_label, 'Identification_Status'] = 'blank'
        df.loc[new_label, 'case'] = 'None'
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = 'N'
        df.loc[new_label, 'line_no'] = new_line_no
        df.loc[index - 1, 'nlb'] = 'Y'
        audit_df.loc[line_no, 'blank_inserted_before'] = 'Yes'
        audit_df.loc[new_line_no] = ''
        set_existing_audit_cells(new_line_no, (('line_removed', 'No'),))

    def check_and_remove_numbers(index):
        data = df.loc[index, 'data']
        scene_num = ''
        # Strip only the digit run(s) at the very start of the text.  Removing
        # by str.replace(digit, '') would also delete the same digits further
        # along the line (e.g. the "101" in "12 INT. ROOM 101").
        while True:
            stripped = data.lstrip()
            leading = re.match(r'\d+', stripped)
            if not leading:
                break
            left_pad = data[:len(data) - len(stripped)]
            data = left_pad + stripped[leading.end():]
            df.loc[index, 'data'] = data
            print(data)
            scene_num += leading.group(0)
        print("scene num",scene_num)

    # ------------------------------------------------------------------
    # Reusable rule steps shared by the per-label auditors.
    # ------------------------------------------------------------------
    def fix_left_indent(index, new_indent):
        if df.loc[index, 'ssc'] != new_indent:
            correct_left_indent(index, new_indent)
        else:
            print("indent already",new_indent)

    def fix_case(index, new_case):
        if df.loc[index, 'case'] != new_case:
            correct_case(index, new_case)
        else:
            print("Case already",new_case)

    def ensure_blank_before(index):
        if df.loc[index, 'plb'] == 'N':
            insert_line_before(index)
            df.loc[index, 'plb'] = 'Y'
        else:
            print("previous line already blank")

    def ensure_no_blank_before(index, otherwise="previous line already blank"):
        if df.loc[index, 'plb'] == 'Y':
            delete_line_before(index)
            df.loc[index, 'plb'] = 'N'
        else:
            print(otherwise)

    def ensure_blank_after(index):
        if df.loc[index, 'nlb'] == 'N':
            insert_line_after(index)
            df.loc[index, 'nlb'] = 'Y'
        else:
            print("next line already blank")

    def ensure_no_blank_after(index):
        """Drop a blank following line; return True when a row was removed."""
        if df.loc[index, 'nlb'] == 'Y':
            delete_line_after(index)
            df.loc[index, 'nlb'] = 'N'
            return True
        print("next line not blank")
        return False

    # ------------------------------------------------------------------
    # Per-label auditors.  Those that may delete the following row return
    # True when they did, so the main loop can skip the removed label.
    # ------------------------------------------------------------------
    def audit_ps1(index):
        print("Auditing Slugline")
        cur_data = df.loc[index, 'data']
        try:
            print(cur_data)
        except Exception:
            pass
        fix_left_indent(index, 15)
        fix_case(index, 'AllUpper')
        if df.loc[index, 'plb'] == 'N':
            # Nothing can be inserted above the very first row; the flag is
            # only flipped when a blank was actually inserted.
            if index != 0:
                insert_line_before(index)
                df.loc[index, 'plb'] = 'Y'
        else:
            print("previous line already blank")
        ensure_blank_after(index)
        ## remove numbers if found at start
        check_and_remove_numbers(index)

    def audit_ps4(index):
        print("Auditing Action Beginning")
        fix_left_indent(index, 15)
        ensure_blank_before(index)
        return ensure_no_blank_after(index)

    def audit_ps5(index):
        print("Auditing Action Middle")
        fix_left_indent(index, 15)
        ensure_no_blank_before(index, "previous line already non blank")
        return ensure_no_blank_after(index)

    def audit_ps6(index):
        print("Auditing Action Ending")
        fix_left_indent(index, 15)
        plb = df.loc[index, 'plb']
        if df.loc[index - 1, 'Identification_Status'] in ('ps4','ps5'):
            if plb == 'Y':
                delete_line_before(index)
                df.loc[index, 'plb'] = 'N'
            else:
                print("previous line already non blank")
        else:
            ## later move this to insert line before
            pnbl_line_no = df.loc[index, 'pnbl_line_no']
            # NOTE(review): a pandas Series has no truth value, so this
            # conditional raises ValueError and pnbl_identified is always
            # False -- the insert below is never reached.  Kept unchanged so
            # the formatting output stays as it is today; the same expression
            # appears in run_audit_on_identified.  Confirm intent before
            # replacing it with an explicit .eq('Yes').any().
            try:
                pnbl_identified = True if df.loc[df['line_no'] == pnbl_line_no,'isIdentified'] == 'Yes' else False
            except Exception:
                pnbl_identified = False
            if plb == 'N' and pnbl_identified:
                insert_line_before(index)
                df.loc[index, 'plb'] = 'Y'
            else:
                print("previous line already blank")
        ensure_blank_after(index)

    def audit_ps7(index):
        print("Auditing Speaker")
        fix_left_indent(index, 35)
        fix_case(index, 'AllUpper')
        ensure_blank_before(index)
        return ensure_no_blank_after(index)

    def audit_parenthetical(index, title):
        # ps10, ps11, ps20 and ps12 share one rule set and differ only in title.
        print(title)
        fix_left_indent(index, 30)
        fix_case(index, 'AllLower')
        ensure_no_blank_before(index)
        return ensure_no_blank_after(index)

    def audit_ps10(index):
        return audit_parenthetical(index, "Auditing Parenthetical complete")

    def audit_ps11(index):
        return audit_parenthetical(index, "Auditing Parenthetical beginning")

    def audit_ps20(index):
        return audit_parenthetical(index, "Auditing Parenthetical middle")

    def audit_ps12(index):
        return audit_parenthetical(index, "Auditing Parenthetical end")

    def audit_ps13(index):
        print("Auditing Dialogue Beginning")
        fix_left_indent(index, 25)
        ensure_no_blank_before(index)
        return ensure_no_blank_after(index)

    def audit_ps14(index):
        print("Auditing Dialogue Middle")
        fix_left_indent(index, 25)
        ensure_no_blank_before(index)
        return ensure_no_blank_after(index)

    def audit_ps15(index):
        print("Auditing Dialogue End")
        fix_left_indent(index, 25)
        ensure_no_blank_before(index, "previous line already not blank")
        if df.loc[index, 'nlb'] == 'N':
            nl_pos = df.loc[index + 1, 'Identification_Status']
            if nl_pos == 'ps10':
                print("not inserting blank as next is parenthtical")
            else:
                insert_line_after(index)
                df.loc[index, 'nlb'] = 'Y'
        else:
            print("next line already blank")

    def audit_ps16(index):
        print("Auditing Transition")
        cur_case = df.loc[index, 'case']
        new_lcp = 72
        if df.loc[index, 'lcp'] != new_lcp:
            correct_right_indent(index, new_lcp)
        else:
            print("indent already",new_lcp)
        print(cur_case,"123")
        fix_case(index, 'AllUpper')
        ensure_blank_before(index)
        ensure_blank_after(index)

    def audit_ps17(index):
        print("Auditing Special Term")
        fix_left_indent(index, 15)
        fix_case(index, 'AllUpper')
        ensure_blank_before(index)
        ensure_blank_after(index)

    # Explicit dispatch table (labels without an entry are skipped).
    auditors = {
        'ps1': audit_ps1, 'ps4': audit_ps4, 'ps5': audit_ps5,
        'ps6': audit_ps6, 'ps7': audit_ps7, 'ps10': audit_ps10,
        'ps11': audit_ps11, 'ps20': audit_ps20, 'ps12': audit_ps12,
        'ps13': audit_ps13, 'ps14': audit_ps14, 'ps15': audit_ps15,
        'ps16': audit_ps16, 'ps17': audit_ps17,
    }

    # Iterate over the labels present at the start: rows inserted during the
    # audit are blank and need no visit, rows removed after the current line
    # are skipped explicitly below.
    index_iter = iter(df.index)
    for index in index_iter:
        cur_line_pos = df.loc[index, 'Identification_Status']
        if cur_line_pos == 'blank':
            continue
        line_no = df.loc[index, 'line_no']
        print("\n")
        print("line no",line_no)
        print("index ",index)
        print(cur_line_pos)
        to_call_fn = auditors.get(cur_line_pos)
        if to_call_fn is None:
            continue
        print(to_call_fn)
        nl_deleted = False
        try:
            nl_deleted = to_call_fn(index)
        except Exception as err:
            # Best-effort: keep what was corrected so far, but say what failed.
            print("audit of line",line_no,"stopped early:",repr(err))
        if nl_deleted :
            # The following row was dropped; consume its label (a default
            # avoids StopIteration escaping when it was the last one).
            next(index_iter, None)
    df = df.sort_index().reset_index(drop=True)
    #df = df.sort_values(by=['line_no']).reset_index(drop =True)
    return df
def run_audit_on_identified(df,audit_df = False):
def correct_case(df,index,new_case,audit_df = False):
##
line_no = df['line_no'][index]
print("correcting case to",new_case)
if new_case == 'AllUpper':
df['data'][index] = df['data'][index].upper()
elif new_case == 'AllLower':
df['data'][index] = df['data'][index].lower()
df['case'][index] = new_case
try:
if not audit_df.empty:
audit_df['case_corrected'][line_no] = 'Corrected to ' + str(new_case)
except:
pass
def correct_left_indent(df,index,new_indent,audit_df= False):
##
line_no = df['line_no'][index]
data = df['data'][index]
data = data.strip()
print("Correcting left indent to",new_indent)
df['data'][index] = data.rjust(len(data)+new_indent)
df['ssc'][index] = new_indent
df['lcp'][index] = new_indent + len(data) - 1
try:
if not audit_df.empty:
audit_df['left_indent_corrected'][line_no] = 'Left indent Corrected to ' + str(new_indent)
except:
pass
def correct_right_indent(df,index,new_lcp,audit_df=False):
##
line_no = df['line_no'][index]
data = df['data'][index]
data = data.strip()
new_indent = 0
print("Correcting right indent to",83 - new_lcp -1)
new_indent = new_lcp - len(data) + 1
df['data'][index] = data.rjust(len(data) + new_indent)
df['ssc'][index] = new_indent
df['lcp'][index] = new_lcp
try:
if not audit_df.empty:
audit_df['right_indent_corrected'][line_no] = 'Right indent Corrected to ' + str(83 - new_lcp -1)
except:
pass
def delete_line_after(df,index,audit_df=False):
line_no = df['line_no'][index]
removed_line_no = df['line_no'][index+1]
df.drop(index + 1, inplace= True)
print("line deleted after",line_no)
print("line no deleted ",removed_line_no)
try:
if not audit_df.empty:
audit_df['blank_deleted_after'][line_no] = 'Yes'
audit_df['line_removed'][removed_line_no] = 'Yes'
except:
pass
def delete_line_before(df,line_no,audit_df=False):
line_no = df['line_no'][index]
removed_line_no = df['line_no'][index-1]
df.drop(index - 1, inplace= True)
print("line deleted before",line_no)
try:
if not audit_df.empty:
audit_df['blank_deleted_before'][line_no] = 'Yes'
audit_df['line_removed'][removed_line_no] = 'Yes'
except:
pass
def insert_line_after(df,index,audit_df=False):
line_no = df['line_no'][index]
next_line_no = df['line_no'][index+1]
new_line_no = (line_no + next_line_no) / 2
try:
if not audit_df.empty:
if new_line_no in audit_df.index:
new_line_no = (new_line_no + next_line_no)/2
except:
pass
print("inserted blank line after ", line_no)
df.loc[index + 0.25] = np.nan
df.loc[index + 0.25,'data'] = ''
df.loc[index + 0.25,'Identification_Status'] = 'blank'
df.loc[index + 0.25,'case'] = ''
df.loc[index + 0.25,'plb'] = 'N'
df.loc[index + 0.25,'nlb'] = 'N'
df.loc[index + 0.25,'line_no'] = new_line_no
df['plb'][index + 1] = 'Y'
try:
if not audit_df.empty:
audit_df['blank_inserted_after'][line_no] = 'Yes'
audit_df.loc[new_line_no] = np.nan
audit_df.loc[new_line_no]['data'] = ''
audit_df.loc[new_line_no]['data_corrected'] = ''
audit_df.loc[new_line_no]['line_removed'] = 'No'
except:
pass
print("line inserted after ",line_no)
def insert_line_before(df,index,audit_df=False):
line_no = df['line_no'][index]
pvs_line_no = df['line_no'][index-1]
new_line_no = (line_no + pvs_line_no) / 2
try:
if not audit_df.empty:
if new_line_no in audit_df.index:
new_line_no = (new_line_no + line_no)/2
except:
pass
print("inserted blank line before",line_no)
df.loc[index - 0.25] = np.nan
df.loc[index - 0.25,'Identification_Status'] = 'blank'
df.loc[index - 0.25,'data'] = ''
df.loc[index - 0.25,'case'] = 'None'
df.loc[index - 0.25,'plb'] = 'N'
df.loc[index - 0.25,'nlb'] = 'N'
df.loc[index - 0.25,'line_no'] = new_line_no
df['nlb'][index - 1] = 'Y'
try:
if not audit_df.empty:
audit_df['blank_inserted_before'][line_no] = 'Yes'
audit_df.loc[new_line_no] = np.nan
audit_df.loc[new_line_no]['line_removed'] = 'No'
except:
pass
def check_and_remove_numbers(df,index,audit_df=False):
data = df['data'][index]
start_is_num = True
scene_num = ''
## check if number at start
while start_is_num:
sub_num = re.search('\d',data.lstrip())
if sub_num:
if sub_num.start() == 0:
data = data.replace(sub_num.group(0),'')
df['data'][index] = data
print(data)
scene_num += sub_num.group(0)
continue
start_is_num = False
print("scene num",scene_num)
def audit_ps1(df,index,audit_df=False):
print("Auditing Slugline")
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
try:
print(cur_data)
except:
pass
new_indent = 15
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllUpper'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'N':
if index != 0 :
insert_line_before(df,index,audit_df)
df['plb'][index] = 'Y'
else:
print("previous line already blank")
if nlb == 'N':
insert_line_after(df,index,audit_df)
df['nlb'][index] = 'Y'
else:
print("next line already blank")
## remove numbers if found at start
check_and_remove_numbers(df,index,audit_df)
def audit_ps4(df,index,audit_df=False):
print("Auditing Action Beginning")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 15
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
# new_case = 'AllLower'
# if cur_case != new_case:
# correct_case(df,audit_df,index,new_case)
# else:
# print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'N':
insert_line_before(df,index,audit_df)
df['plb'][index] = 'Y'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps5(df,index,audit_df=False):
print("Auditing Action Middle")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 15
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
# new_case = 'AllLower'
# if cur_case != new_case:
# correct_case(df,audit_df,index,new_case)
# else:
# print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already non blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps6(df,index,audit_df=False):
print("Auditing Action Ending")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 15
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
# new_case = 'AllLower'
# if cur_case != new_case:
# correct_case(df,audit_df,index,new_case)
# else:
# print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if df['Identification_Status'][index - 1] in ('ps4','ps5'):
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already non blank")
else:
## later move this to insert line before
pnbl_line_no = df['pnbl_line_no'][index]
try:
pnbl_identified = True if df.loc[df['line_no'] == pnbl_line_no,'isIdentified'] == 'Yes' else False
except:
pnbl_identified = False
if plb == 'N' and pnbl_identified:
insert_line_before(df,index,audit_df)
df['plb'][index] = 'Y'
else:
print("previous line already blank")
if nlb == 'N':
insert_line_after(df,index,audit_df)
df['nlb'][index] = 'Y'
else:
print("next line already blank")
def audit_ps7(df,index,audit_df=False):
print("Auditing Speaker")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 35
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllUpper'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'N':
insert_line_before(df,index,audit_df)
df['plb'][index] = 'Y'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps8(df,index,audit_df=False):
print("Auditing Speaker")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 35
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllUpper'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'N':
insert_line_before(df,index,audit_df)
df['plb'][index] = 'Y'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps10(df,index,audit_df=False):
print("Auditing Parenthetical complete")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 30
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllLower'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps11(df,index,audit_df=False):
print("Auditing Parenthetical beginning")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 30
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllLower'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps20(df,index,audit_df=False):
print("Auditing Parenthetical middle")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 30
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllLower'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps12(df,index,audit_df=False):
print("Auditing Parenthetical end")
nl_deleted = False
cur_indent = df['ssc'][index]
cur_data = df['data'][index]
cur_case = df['case'][index]
new_indent = 30
if cur_indent != new_indent:
correct_left_indent(df,index,new_indent,audit_df)
else:
print("indent already",new_indent)
#check and correct case
new_case = 'AllLower'
if cur_case != new_case:
correct_case(df,index,new_case,audit_df)
else:
print("Case already",new_case)
## plb nlb
plb = df['plb'][index]
nlb = df['nlb'][index]
if plb == 'Y':
delete_line_before(df,index,audit_df)
df['plb'][index] = 'N'
else:
print("previous line already blank")
if nlb == 'Y':
delete_line_after(df,index,audit_df)
nl_deleted = True
df['nlb'][index] = 'N'
else:
print("next line not blank")
return nl_deleted
def audit_ps13(df,index,audit_df=False):
    """Audit a 'Dialogue Beginning' (ps13) line.

    Enforces a left indent of 25 (column 'ssc'); the case of dialogue is
    left untouched. Removes a blank line before and/or after the row when
    the 'plb' / 'nlb' flags are 'Y'.

    Parameters:
        df: line-level DataFrame; the row is addressed by label ``index``.
        index: label of the row being audited.
        audit_df: passed through unchanged to the correction helpers.

    Returns:
        True when the following line was deleted, otherwise False.
    """
    print("Auditing Dialogue Beginning")
    nl_deleted = False
    cur_indent = df['ssc'][index]
    new_indent = 25
    if cur_indent != new_indent:
        correct_left_indent(df,index,new_indent,audit_df)
    else:
        print("indent already",new_indent)
    # Flags are sampled up front so both decisions use the pre-edit state.
    plb = df['plb'][index]
    nlb = df['nlb'][index]
    if plb == 'Y':
        delete_line_before(df,index,audit_df)
        # .loc avoids the chained assignment df['plb'][index] = ..., which
        # is not guaranteed to write back into df.
        df.loc[index,'plb'] = 'N'
    else:
        print("previous line already not blank")
    if nlb == 'Y':
        delete_line_after(df,index,audit_df)
        nl_deleted = True
        df.loc[index,'nlb'] = 'N'
    else:
        print("next line not blank")
    return nl_deleted
def audit_ps14(df,index,audit_df=False):
    """Audit a 'Dialogue Middle' (ps14) line.

    Enforces a left indent of 25 (column 'ssc'); the case of dialogue is
    left untouched. Removes a blank line before and/or after the row when
    the 'plb' / 'nlb' flags are 'Y'.

    Parameters:
        df: line-level DataFrame; the row is addressed by label ``index``.
        index: label of the row being audited.
        audit_df: passed through unchanged to the correction helpers.

    Returns:
        True when the following line was deleted, otherwise False.
    """
    print("Auditing Dialogue Middle")
    nl_deleted = False
    cur_indent = df['ssc'][index]
    new_indent = 25
    if cur_indent != new_indent:
        correct_left_indent(df,index,new_indent,audit_df)
    else:
        print("indent already",new_indent)
    # Flags are sampled up front so both decisions use the pre-edit state.
    plb = df['plb'][index]
    nlb = df['nlb'][index]
    if plb == 'Y':
        delete_line_before(df,index,audit_df)
        # .loc avoids the chained assignment df['plb'][index] = ..., which
        # is not guaranteed to write back into df.
        df.loc[index,'plb'] = 'N'
    else:
        print("previous line already not blank")
    if nlb == 'Y':
        delete_line_after(df,index,audit_df)
        nl_deleted = True
        df.loc[index,'nlb'] = 'N'
    else:
        print("next line not blank")
    return nl_deleted
def audit_ps15(df,index,audit_df=False):
    """Audit a 'Dialogue End' (ps15) line.

    Enforces a left indent of 25 (column 'ssc'), removes a blank line
    before the row when 'plb' is 'Y', and makes sure a blank line follows
    the row unless the next row is a parenthetical ('ps10').

    Parameters:
        df: line-level DataFrame; the row is addressed by label ``index``
            and the following row by ``index + 1``.
        index: label of the row being audited.
        audit_df: passed through unchanged to the correction helpers.

    Returns:
        None. This audit never deletes the following line.

    Raises:
        KeyError: when 'nlb' is 'N' and there is no row labelled
            ``index + 1`` (e.g. the dialogue ends the script).
    """
    print("Auditing Dialogue End")
    cur_indent = df['ssc'][index]
    new_indent = 25
    if cur_indent != new_indent:
        correct_left_indent(df,index,new_indent,audit_df)
    else:
        print("indent already",new_indent)
    # Flags are sampled up front so both decisions use the pre-edit state.
    plb = df['plb'][index]
    nlb = df['nlb'][index]
    if plb == 'Y':
        delete_line_before(df,index,audit_df)
        # .loc avoids the chained assignment df['plb'][index] = ..., which
        # is not guaranteed to write back into df.
        df.loc[index,'plb'] = 'N'
    else:
        print("previous line already not blank")
    if nlb == 'N':
        nl_pos = df['Identification_Status'][index+1]
        if nl_pos == 'ps10':
            # A parenthetical directly after dialogue stays attached to it.
            print("not inserting blank as next is parenthtical")
        else:
            insert_line_after(df,index,audit_df)
            df.loc[index,'nlb'] = 'Y'
    else:
        print("next line already blank")
def audit_ps16(df,index,audit_df=False):
    """Audit a 'Transition' (ps16) line.

    Enforces a right edge of 72 (column 'lcp') and the case 'AllUpper'
    (column 'case'), then inserts a blank line before and/or after the row
    when the 'plb' / 'nlb' flags are 'N'.

    Parameters:
        df: line-level DataFrame; the row is addressed by label ``index``.
        index: label of the row being audited.
        audit_df: passed through unchanged to the correction helpers.

    Returns:
        None.
    """
    print("Auditing Transition")
    cur_case = df['case'][index]
    cur_lcp = df['lcp'][index]
    new_lcp = 72
    if cur_lcp != new_lcp:
        correct_right_indent(df,index,new_lcp,audit_df)
    else:
        print("indent already",new_lcp)
    # check and correct case
    print(cur_case,"123")
    new_case = 'AllUpper'
    if cur_case != new_case:
        correct_case(df,index,new_case,audit_df)
    else:
        print("Case already",new_case)
    # Flags are sampled up front so both decisions use the pre-edit state.
    plb = df['plb'][index]
    nlb = df['nlb'][index]
    if plb == 'N':
        insert_line_before(df,index,audit_df)
        # .loc avoids the chained assignment df['plb'][index] = ..., which
        # is not guaranteed to write back into df.
        df.loc[index,'plb'] = 'Y'
    else:
        print("previous line already blank")
    if nlb == 'N':
        insert_line_after(df,index,audit_df)
        df.loc[index,'nlb'] = 'Y'
    else:
        print("next line already blank")
def audit_ps17(df,index,audit_df=False):
    """Audit a 'Special Term' (ps17) line.

    Enforces a left indent of 15 (column 'ssc') and the case 'AllUpper'
    (column 'case'), then inserts a blank line before and/or after the row
    when the 'plb' / 'nlb' flags are 'N'.

    Parameters:
        df: line-level DataFrame; the row is addressed by label ``index``.
        index: label of the row being audited.
        audit_df: passed through unchanged to the correction helpers.

    Returns:
        None.
    """
    print("Auditing Special Term")
    cur_indent = df['ssc'][index]
    cur_case = df['case'][index]
    new_indent = 15
    if cur_indent != new_indent:
        correct_left_indent(df,index,new_indent,audit_df)
    else:
        print("indent already",new_indent)
    # check and correct case
    new_case = 'AllUpper'
    if cur_case != new_case:
        correct_case(df,index,new_case,audit_df)
    else:
        print("Case already",new_case)
    # Flags are sampled up front so both decisions use the pre-edit state.
    plb = df['plb'][index]
    nlb = df['nlb'][index]
    if plb == 'N':
        insert_line_before(df,index,audit_df)
        # .loc avoids the chained assignment df['plb'][index] = ..., which
        # is not guaranteed to write back into df.
        df.loc[index,'plb'] = 'Y'
    else:
        print("previous line already blank")
    if nlb == 'N':
        insert_line_after(df,index,audit_df)
        df.loc[index,'nlb'] = 'Y'
    else:
        print("next line already blank")
index_iter = iter(df.index)
for index in index_iter:
if (df['Identification_Status'][index] == 'blank'):
continue
nl_deleted = False
cur_line_pos = df['Identification_Status'][index]
fn_name = 'audit_' + cur_line_pos
line_no = df['line_no'][index]
print("\n")
print("line no",line_no)
print("index ",index)
print(cur_line_pos)
print(df['line_no'].dtype)
try:
to_call_fn = locals()[fn_name]
print(to_call_fn)
except:
continue
try:
nl_deleted = to_call_fn(df,index,audit_df)
except:
pass
if nl_deleted :
next(index_iter)
df = df.sort_index().reset_index(drop=True)
#df = df.sort_values(by=['line_no']).reset_index(drop =True)
try:
if not audit_df.empty:
return df,audit_df
else:
return df
except:
return df
def merge_line_to_para(df):
    """Collapse line-level script rows into paragraph-level rows.

    Rows are consumed in order. Each source row either becomes a paragraph
    of its own or, for multi-line elements, is merged with the run of
    continuation rows that directly follows it (texts are stripped and
    joined with single spaces).

    Parameters:
        df: DataFrame with the columns 'Identification_Status' (line
            position code such as 'ps1', 'blank', ...) and 'data' (line text).

    Returns:
        DataFrame with the object-typed columns 'para_no', 'scene_no',
        'content' and 'script_element', indexed 1..n by paragraph number.
        'scene_no' is the number of sluglines seen so far. Rows whose
        status is not recognised keep '' in every column except 'para_no'.
    """
    columns = ['para_no', 'scene_no', 'content', 'script_element']
    # Statuses that always form a paragraph on their own -> element label.
    standalone = {
        'ps1': 'slugline',
        'ps3': 'slugline',
        'ps6': 'action',
        'ps7': 'speaker',
        'ps8': 'speaker',
        'ps9': 'parenthetical',
        'ps10': 'parenthetical',
        'ps12': 'parenthetical',
        'ps15': 'dialogue',
        'ps16': 'transition',
        'ps17': 'special_term',
        'ps27': 'dialogue',
    }
    # Statuses that open a multi-line element ->
    # (element label, statuses of following rows folded into the paragraph).
    mergeable = {
        'ps4': ('action', ('ps5', 'ps6')),
        'ps5': ('action', ('ps6',)),
        'ps11': ('parenthetical', ('ps20', 'ps12')),
        'ps20': ('parenthetical', ('ps12',)),
        'ps13': ('dialogue', ('ps14', 'ps15')),
        'ps14': ('dialogue', ('ps15',)),
    }

    statuses = df['Identification_Status'].tolist()
    texts = df['data'].tolist()
    total = len(statuses)

    rows = []
    scene_no = 0
    pos = 0
    while pos < total:
        line_pos = statuses[pos]
        row = {'para_no': len(rows) + 1, 'scene_no': '', 'content': '', 'script_element': ''}
        rows.append(row)
        consumed = 1  # number of source rows swallowed by this paragraph

        if line_pos == 'blank':
            row['script_element'] = 'blank'
            row['scene_no'] = scene_no
        elif line_pos == 'ps2':
            # A two-line slugline: fold in the 'ps3' row when one follows.
            content = texts[pos].strip()
            if pos + 1 < total and statuses[pos + 1] == 'ps3':
                content = content + ' ' + texts[pos + 1].strip()
                consumed = 2
            scene_no += 1
            row['content'] = content
            row['script_element'] = 'slugline'
            row['scene_no'] = scene_no
        elif line_pos in mergeable:
            element, followers = mergeable[line_pos]
            parts = [texts[pos].strip()]
            nxt = pos + 1
            # The bounds check matters: the opening row may be the last row.
            while nxt < total and statuses[nxt] in followers:
                parts.append(texts[nxt].strip())
                nxt += 1
            consumed = nxt - pos
            row['content'] = ' '.join(parts)
            row['script_element'] = element
            row['scene_no'] = scene_no
        elif line_pos in standalone:
            element = standalone[line_pos]
            if element == 'slugline':
                scene_no += 1
            row['content'] = texts[pos].strip()
            row['script_element'] = element
            row['scene_no'] = scene_no
        # Any other status leaves the paragraph row empty.

        pos += consumed

    # Build the frame once instead of growing it row by row.
    para_df = pd.DataFrame(rows, columns=columns, dtype=object)
    para_df.index = pd.RangeIndex(1, len(rows) + 1)
    return para_df
def wrap_text(df,audit_df):
# Re-wrap script lines to fixed widths and record the changes in audit_df.
#
# NOTE(review): this copy of the file has lost its leading whitespace, so the
# nesting described in the comments below is inferred from statement order;
# confirm against a correctly indented copy before relying on it.
#
# Parameters:
#   df       -- line-level DataFrame; the code reads and writes the columns
#               'Identification_Status', 'isIdentified', 'data' and 'line_no'.
#   audit_df -- audit DataFrame addressed by line number (rows are tested with
#               ``line_no in audit_df.index`` and created with
#               ``audit_df.loc[line_no]``); its 'line_removed' and
#               'line_wrapped_at_prescribed_right_indent' columns are set to
#               'Yes'. It is modified in place and not returned.
# Returns: df sorted by 'line_no' with a fresh 0..n-1 index.
#
# First pass:  'ps1' rows are only printed wrapped at 58 columns,
#              'ps6' rows trigger re-wrapping of action text at 58 columns,
#              'ps15' rows trigger re-wrapping of dialogue text at 35 columns.
# Second pass: after sorting by 'line_no', 'ps10' rows are re-wrapped at
#              20 columns.
# df.reset_index(inplace=True, drop=True)
# audit_df.reset_index(inplace=True, drop=True) #df['line_no'] = df['line_no'].astype(str).astype(float)
index_iter = iter(df.index)
print("wrapping lines")
print(df.dtypes)
for index in index_iter:
line_pos = df['Identification_Status'][index]
# Blank rows and rows flagged as not identified are never wrapped.
if line_pos == 'blank' or df['isIdentified'][index] == 'No':
continue
data = df['data'][index]
data = data.strip()
print("line no",df['line_no'][index],df['line_no'].dtype)
# Sluglines: the wrapped form is only printed; df is not changed here.
if line_pos == 'ps1':
print("checking Slugline")
if len(data) > 58:
print("Need to wrap line")
print("data 9808",data)
wrapped_data = textwrap.wrap(data, width = 58)
for line in wrapped_data:
try:
print("line 9812:",line)
except:
pass
# Action: starting at a 'ps6' row, walk towards smaller index values and
# prepend each row's text until a 'blank' status is reached.
if line_pos == 'ps6':
action_data = ''
action_list = []
print("checking Action line")
cur_lines_count = 0
action_index = index
while line_pos != 'blank':
data = df['data'][action_index]
line_no = df['line_no'][action_index]
try:
print("9827\n",line_pos,line_no,data)
except:
pass
action_data = data.strip() + ' ' + action_data
cur_lines_count += 1
action_list.append(line_no)
action_index -= 1
if action_index < 0:
break
try:
line_pos = df['Identification_Status'][action_index]
except:
line_pos = ''
if line_pos == '' or df['isIdentified'][action_index] == 'No':
break
# The row is skipped when the backwards walk ran past index 0 or hit a
# missing / unidentified row.
if action_index < 0:
continue
if line_pos == '' or df['isIdentified'][action_index] == 'No':
continue
action_start_index = action_index + 1
action_data = action_data.strip()
print("Number of action lines",cur_lines_count)
if len(action_data) > 58:
print("Need to wrap line")
try:
print("actiob data:\n",action_data)
except:
pass
wrapped_data = textwrap.wrap(action_data, width = 58)
print("Wrapped line 9753")
wrapped_data_lines_count = len(wrapped_data)
if cur_lines_count == wrapped_data_lines_count:
#can change the original line(s) data
print("cur and wrapped number of lines same")
# for i in range(0,cur_lines_count):
# print(wrapped_data[i])
elif wrapped_data_lines_count > cur_lines_count:
lines_to_add = wrapped_data_lines_count - cur_lines_count
#Multiple action lines
print("will need to create ",lines_to_add," more lines")
pvs_line_no = df['line_no'][index-1] #float
cur_line_no = df['line_no'][index] #float
cur_line_index = index
pvs_line_index = cur_line_index -1
print("in line no 9874")
# Each added row gets a line number and an index label halfway between the
# current row and the previous one; the line number is moved closer to the
# current one while it collides with an existing audit_df label.
while lines_to_add != 0:
new_line_no = 0.0
pvs_line_no = df['line_no'][pvs_line_index] #float
new_line_no = (cur_line_no + pvs_line_no ) / 2
while new_line_no in audit_df.index:
new_line_no = (cur_line_no + new_line_no)/2
action_list.append(new_line_no)
new_line_index = (cur_line_index + pvs_line_index) /2
df.loc[new_line_index] = np.nan
df.loc[new_line_index,'line_no'] = new_line_no
if df['Identification_Status'][pvs_line_index] == 'blank':
df.loc[new_line_index,'Identification_Status'] = 'ps4'
else:
df.loc[new_line_index,'Identification_Status'] = 'ps5'
#cur_line_no = new_line_no
pvs_line_index = new_line_index
#cur_line_index -= 1
lines_to_add -= 1
elif wrapped_data_lines_count < cur_lines_count:
lines_to_remove = cur_lines_count - wrapped_data_lines_count
print("Will need to remove ",lines_to_remove, "lines")
remove_index = index -1
#pvs_line_no = df['line_no'][remove_index]
# Surplus rows are dropped starting with the row just before the current
# one; each removal is flagged in audit_df['line_removed'].
while lines_to_remove != 0:
pvs_line_no = df['line_no'][remove_index]
## remove pvs line
df.drop(remove_index, inplace= True)
audit_df['line_removed'][pvs_line_no] = 'Yes'
action_list.remove(pvs_line_no)
remove_index -= 1
lines_to_remove -= 1
action_list.sort()
print(action_list)
## add these lines in the original df
print("in line no 9914")
## now assign the values to these lines
# Wrapped pieces are written back in ascending line-number order.
wrapped_index = 0
for line_no in action_list:
df_index = df.index[df['line_no'] == line_no]
df['data'][df_index] = wrapped_data[wrapped_index]
if line_no not in audit_df.index:
audit_df.loc[line_no] = np.nan
audit_df.loc[line_no,'data'] = ''
audit_df.loc[line_no,'data_corrected'] = ''
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
wrapped_index += 1
else:
print("No need to wrap line")
try:
print(action_data)
except:
pass
print(len(action_data))
print("in line no 9936")
# Dialogue: starting at a 'ps15' row, walk towards smaller index values until
# one of the stop statuses in the tuple below is reached, relabelling the
# rows passed on the way ('ps15' for the starting row, 'ps14' for the others).
if line_pos == 'ps15':
dialogue_data = ''
dialogue_list = []
print("\n checking Dialogue line",index)
cur_lines_count = 0
dialogue_index = int(index)
while line_pos not in ('ps7','ps8','ps10','ps12','ps5','ps6'): ## added 5 and 6 as wrong identification causes previous line to be ps5
data = df['data'][dialogue_index]
line_no = df['line_no'][dialogue_index]
try:
print(dialogue_index,line_no,line_pos,data)
except:
pass
#dialogue_data = data.strip() + ' ' + dialogue_data
# Non-string data (no .strip) is converted with str() before being prepended.
try:
dialogue_data = data.strip() + ' ' + dialogue_data
except:
data = str(data)
dialogue_data = data.strip() + ' ' + dialogue_data
cur_lines_count += 1
if dialogue_index == index:
df['Identification_Status'][dialogue_index] = 'ps15'
else:
df['Identification_Status'][dialogue_index] = 'ps14'
dialogue_index -= 1
dialogue_list.append(line_no)
print("\nprinting isIdentified: ")
# li is '' when the lookup of the previous row fails, otherwise the result
# of the == 'No' comparison.
try:
li = df['isIdentified'][dialogue_index] == 'No'
print("dialogue bunch not fully identified")
except:
li = ''
print("dialogue bunch not fully identified")
if li == '' or df['isIdentified'][dialogue_index] == 'No':
break
# if df['isIdentified'][dialogue_index] == 'No' :
# print("dialogue bunch not fully identified")
# break
line_pos = df['Identification_Status'][dialogue_index]
if li == '' or df['isIdentified'][dialogue_index] == 'No' : #added li == ''
print("dialogue bunch not fully identified")
continue
dialogue_start_index = dialogue_index + 1
# When more than one row was collected, the first of them becomes 'ps13'.
if dialogue_start_index != index:
df['Identification_Status'][dialogue_start_index] = 'ps13'
dialogue_data = dialogue_data.strip()
print("Number of dialogue lines 9990",cur_lines_count)
if len(dialogue_data) > 35:
print("Need to wrap dialogue line 9992")
try:
print(dialogue_data)
except:
pass
wrapped_data = textwrap.wrap(dialogue_data, width = 35)
wrapped_data_lines_count = len(wrapped_data)
if cur_lines_count == wrapped_data_lines_count:
#can change the original line(s) data
print("cur and wrapped number of lines same")
# for i in range(0,cur_lines_count):
# print(wrapped_data[i])
elif wrapped_data_lines_count > cur_lines_count:
lines_to_add = wrapped_data_lines_count - cur_lines_count
#Multiple action lines
print("will need to create ",lines_to_add," more lines")
pvs_line_no = float(df['line_no'][index-1])
cur_line_no = float(df['line_no'][index])
cur_line_index = index
pvs_line_index = cur_line_index -1
# Same midpoint scheme as for action lines; here a candidate line number must
# also not already be in dialogue_list.
while lines_to_add != 0:
new_line_no = 0.0
pvs_line_no = float(df['line_no'][pvs_line_index])
new_line_no = (cur_line_no + pvs_line_no ) / 2
while (new_line_no in audit_df.index) or (new_line_no in dialogue_list):
new_line_no = (cur_line_no + new_line_no)/2
new_line_no = new_line_no
print(cur_line_index,cur_line_no,pvs_line_no,new_line_no)
dialogue_list.append(new_line_no)
new_line_index = (cur_line_index + pvs_line_index) /2
df.loc[new_line_index] = np.nan
df.loc[new_line_index,'line_no'] = new_line_no
if df['Identification_Status'][pvs_line_index] in ('ps7','ps10','ps12'):
df.loc[new_line_index,'Identification_Status'] = 'ps13'
else:
df.loc[new_line_index,'Identification_Status'] = 'ps14'
#cur_line_no = new_line_no
#cur_line_index -= 1
pvs_line_index = new_line_index
lines_to_add -= 1
elif wrapped_data_lines_count < cur_lines_count:
lines_to_remove = cur_lines_count - wrapped_data_lines_count
print("Will need to remove ",lines_to_remove, "lines")
remove_index = index -1
#pvs_line_no = df['line_no'][remove_index]
while lines_to_remove != 0:
pvs_line_no = df['line_no'][remove_index]
## remove pvs line
df.drop(remove_index, inplace= True)
audit_df['line_removed'][pvs_line_no] = 'Yes'
dialogue_list.remove(pvs_line_no)
remove_index -= 1
lines_to_remove -= 1
# If sorting raises (the fallback message suggests mixed value types), all
# non-int entries are converted to float and the sort is retried.
try:
dialogue_list.sort()
except:
print("converting dialogue_list to float")
dialogue_list = [float(value) if type(value) != int else value for value in dialogue_list]
dialogue_list.sort()
print("diaogue_list",dialogue_list)
## add these lines in the original df
print("in line no 10060")
## now assign the values to these lines
wrapped_index = 0
for line_no in dialogue_list:
df_index = df.index[df['line_no'] == line_no]
df['data'][df_index] = wrapped_data[wrapped_index]
# Unlike the action branch (np.nan), a new audit row is filled with 'No' here.
if line_no not in audit_df.index:
audit_df.loc[line_no] = 'No'
audit_df.loc[line_no,'data'] = ''
audit_df.loc[line_no,'data_corrected'] = ''
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
wrapped_index += 1
else:
print("No need to wrap line")
try:
print(dialogue_data)
except:
pass
print(len(dialogue_data))
# if all(isinstance(val, int) for val in df['line_no']):
# print("All values in 'line_no' are integers.")
# elif all(isinstance(val, str) for val in df['line_no']):
# print("All values in 'line_no' are strings. Converting to floats or integers...")
#try:
# df['line_no'] = df['line_no'].astype(int)
# print("Converted 'line_no' column to integers.")
#except ValueError:
# df['line_no'] = df['line_no'].astype(float)
# print("Converted 'line_no' column to floats.")
# else:
# print("Values in 'line_no' are of mixed types.")
# Second pass: order rows by line number and renumber the index, then give
# rows without text an empty string in 'data'.
df = df.sort_values(by=['line_no']).reset_index(drop =True)
index_iter = iter(df.index)
df.fillna({'data':''},inplace=True)
for index in index_iter:
print(index)
line_pos = df['Identification_Status'][index]
if line_pos == 'blank':
continue
data = df['data'][index]
try:
print("data",data)
print(type(data))
except:
pass
data = data.strip()
# Parenthetical: only the single 'ps10' row itself is considered (no
# backwards walk), wrapped at 20 columns.
if line_pos == 'ps10' :
par_data = ''
par_list = []
print("checking Parenthetical line")
cur_lines_count = 0
par_index = index
data = df['data'][par_index]
line_no = df['line_no'][par_index]
try:
print(line_pos,data)
except:
pass
par_data = data.strip()
cur_lines_count += 1
par_list.append(line_no)
line_pos = df['Identification_Status'][par_index]
print("Number of parenthetical lines",cur_lines_count)
print("index",par_index,"line_no",line_no)
if len(par_data) > 20:
print("Need to wrap parenthetical line 10133")
try:
print(par_data)
except:
pass
wrapped_data = textwrap.wrap(par_data, width = 20)
wrapped_data_lines_count = len(wrapped_data)
if wrapped_data_lines_count > cur_lines_count:
lines_to_add = wrapped_data_lines_count - cur_lines_count
#Multiple par lines
print("will need to create ",lines_to_add," more lines")
pvs_line_no = df['line_no'][index-1] #float
cur_line_no = df['line_no'][index] #float
cur_line_index = index
pvs_line_index = cur_line_index -1
# Here the "current" end of the interval moves to each newly created row
# (cur_line_no / cur_line_index are reassigned), while pvs_line_index stays
# fixed, so successive new rows land ever closer to the previous row.
while lines_to_add != 0:
new_line_no = 0.0
pvs_line_no = df['line_no'][pvs_line_index]
try:
new_line_no = (cur_line_no + pvs_line_no ) / 2
except:
new_line_no = (float(cur_line_no) + pvs_line_no ) / 2
while new_line_no in audit_df.index:
new_line_no = (cur_line_no + new_line_no)/2
new_line_no = (new_line_no)
par_list.append(new_line_no)
new_line_index = (cur_line_index + pvs_line_index) /2
df.loc[new_line_index] = np.nan
df.loc[new_line_index,'line_no'] = new_line_no
if df['Identification_Status'][pvs_line_index] in ('ps7','ps8','ps15'):
df.loc[new_line_index,'Identification_Status'] = 'ps11'
df.loc[new_line_index,'isIdentified'] = 'Yes'
else:
df.loc[new_line_index,'Identification_Status'] = 'ps20'
df.loc[new_line_index,'isIdentified'] = 'Yes'
cur_line_no = new_line_no
cur_line_index = new_line_index
lines_to_add -= 1
# The original 'ps10' row is relabelled 'ps12'.
df['Identification_Status'][index] = 'ps12'
# If sorting raises, string entries are replaced by one-element numpy float
# arrays before retrying; the lookup below then falls back to line_no[0].
try:
par_list.sort()
except :
print("exception accepted:")
par_list = [np.array([float(x)]) if isinstance(x, str) else x for x in par_list]
par_list.sort()
print("\n\npar_list:",par_list,"\n\n")
## add these lines in the original df
## now assign the values to these lines
wrapped_index = 0
for line_no in par_list:
try:
df_index = df.index[df['line_no'] == line_no]
print("try block executed\n")
except:
print("Exception:")
df_index = df.index[df['line_no'] == line_no[0]]
print("except block executed\n")
print("printing df_index 10200",df_index,"\n")
df['data'][df_index] = wrapped_data[wrapped_index]
print("printing audit_df:\n",audit_df.index,"\n")
print("checking the audit_fd:",line_no,"\n")
# NOTE(review): audit_df.loc[line_no]['data'] = '' is a chained assignment and
# may not write back into audit_df (the action/dialogue branches use
# audit_df.loc[line_no,'data'] instead) -- confirm intended behaviour.
try:
if line_no not in audit_df.index:
audit_df.loc[line_no] = np.nan
audit_df.loc[line_no]['data'] = ''
audit_df.loc[line_no]['data_corrected'] = ''
print("###########try############")
except Exception as e:
print("Exception accepted:",e)
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
wrapped_index += 1
else:
print("No need to wrap line")
try:
print(par_data)
except:
pass
print(len(par_data))
# Final ordering: if sorting by 'line_no' raises, string line numbers are
# converted to np.float64 and the sort is retried.
try:
df = df.sort_values(by=['line_no']).reset_index(drop =True)
except:
print("Exception 10184:")
df['line_no'] = [np.float64(val) if isinstance(val, str) else val for val in df['line_no']]
df = df.sort_values(by=['line_no']).reset_index(drop =True)
print("The df in merge_text123456789")
print(df)
return df
def check_slug_still_unidentified(df):
    """Report whether any unidentified line is still a slugline candidate.

    Only rows whose 'isIdentified' value is 'No' are inspected. For those
    rows 'Identification_Status' is treated as a ';'-separated list of
    candidate codes, and only its first two entries are considered.

    Parameters:
        df: DataFrame with the columns 'isIdentified' and
            'Identification_Status'.

    Returns:
        True as soon as one of the top-two candidates of an unidentified
        row is 'ps1', 'ps2' or 'ps18'; otherwise False.
    """
    print("checking if all slugs were identified")
    slug_codes = ['ps1','ps2','ps18']
    unidentified = df.loc[df['isIdentified'] == 'No',:]
    for row_label in unidentified.index:
        try:
            candidates = unidentified['Identification_Status'][row_label].split(';')
            print(candidates)
            candidates = candidates[0:2]
            print("top2 line pos",candidates)
        except:
            # Anything that cannot be split (e.g. a non-string status)
            # contributes no candidates.
            candidates = []
        if any(code in slug_codes for code in candidates):
            return True
    return False
def sa_wrapped_output_to_docx(para_df,output_docx):
    """Write paragraph-level script rows to a new .docx file.

    A fresh document is created with 12pt 'Courier New' as the Normal
    font, a page 297mm high and 8.57in wide, and a 1.5in left margin. One
    paragraph is added per row of ``para_df``; rows whose 'script_element'
    is 'blank' stay empty, all others receive the row's 'content' and the
    left/right indents that belong to their element type.

    Parameters:
        para_df: DataFrame with the columns 'script_element' and 'content'.
        output_docx: path the document is saved to.
    """
    # element -> (left indent, right indent), both in inches
    indents_by_element = {
        'slugline': (0, 0),
        'action': (0, 0),
        'dialogue': (1.0, 1.25),
        'parenthetical': (1.5, 2.25),
        'speaker': (2, 1),
        'transition': (2.5, 0),
        'special_term': (0, 0),
    }

    output_doc = Document()
    normal_font = output_doc.styles['Normal'].font
    normal_font.name = 'Courier New'
    normal_font.size = Pt(12)

    first_section = output_doc.sections[0]
    first_section.page_height = Mm(297)
    first_section.page_width = Inches(8.57)
    first_section.left_margin = Inches(1.5)

    for row_label in para_df.index:
        para = output_doc.add_paragraph()
        fmt = para.paragraph_format
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        fmt.line_spacing = Pt(12)

        element = para_df['script_element'][row_label]
        content = para_df['content'][row_label]
        if element == 'blank':
            # The empty paragraph added above is the blank line.
            continue
        if element == 'transition':
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        if element in indents_by_element:
            left_in, right_in = indents_by_element[element]
            fmt.left_indent = Inches(left_in)
            fmt.right_indent = Inches(right_in)
        para.text = content

    output_doc.save(output_docx)
def sa_output_to_docx(df,output_docx,output_template):
    """Write the audited line-level script to a .docx based on a template.

    The template's first paragraph receives "FADE IN:" when the script
    starts with a slugline (optionally preceded by one blank line). Every
    row of ``df`` then becomes one paragraph. When all sluglines were
    identified, slugline rows ('ps1'/'ps2') are numbered on both sides with
    a running scene number. A line counter is used to insert page breaks so
    that a speaker or slugline is not left at the bottom of a page. A final
    "FADE OUT:" paragraph is appended before saving.

    Parameters:
        df: DataFrame with the columns 'Identification_Status', 'data' and
            'isIdentified' (the latter is read by
            ``check_slug_still_unidentified``). Rows 0 and 1 and the row
            after a speaker are looked up by label.
        output_docx: path the document is saved to.
        output_template: path of the .docx template to start from.
    """
    slug_starts = ('ps1','ps2')
    speakers = ('ps7','ps8')

    output_doc = Document(output_template)
    font = output_doc.styles['Normal'].font
    font.name = 'Courier New'
    font.size = Pt(12)
    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Inches(1.5)

    def _tight(paragraph_format):
        # Single-spaced 12pt lines without extra space around paragraphs.
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)
        return paragraph_format

    def _add_plain_paragraph(text):
        # Non-slug lines: drop the first 15 characters of non-empty lines.
        para = output_doc.add_paragraph()
        _tight(para.paragraph_format)
        para.text = text[15:] if text.strip() else ''

    paragraph = output_doc.paragraphs[0]
    _tight(paragraph.paragraph_format)

    slug_still_unidentified = check_slug_still_unidentified(df)
    print(slug_still_unidentified)

    statuses = df['Identification_Status']
    # .get returns None instead of raising when the frame has fewer than
    # two rows, so short scripts simply get no "FADE IN:".
    first_status = statuses.get(0)
    second_status = statuses.get(1)
    lines_added = 0
    if first_status == 'blank' and second_status in slug_starts:
        ## add Fade in
        paragraph.add_run("FADE IN:")
        lines_added = 1
    elif first_status in slug_starts:
        ## add Fade in and blank
        paragraph.add_run("FADE IN:")
        paragraph.add_run().add_break()
        lines_added = 2

    scene_no = 1
    for index in df.index:
        pos = statuses[index]
        if lines_added == 56:
            ## add break if dialogue is getting separated
            if pos in speakers:
                # The speaker may be the last row; .get avoids a KeyError.
                if statuses.get(index + 1) in ('ps9','ps10'):
                    output_doc.add_page_break()
                    lines_added = 0
            elif pos in slug_starts:
                ## add a page break if next line is slug
                output_doc.add_page_break()
                lines_added = 0
        if lines_added == 57:
            if pos in speakers or pos in slug_starts:
                output_doc.add_page_break()
                lines_added = 0
        if lines_added == 58:
            lines_added = 0

        data = df['data'][index]
        try:
            print(index,data)
        except Exception:
            # Printing is diagnostic only; a console that cannot show the
            # text must not stop the export.
            pass

        if slug_still_unidentified or pos not in slug_starts:
            _add_plain_paragraph(data)
        else:
            scene_data = str(scene_no)
            left_indent = 12
            print("Removing already present scene number")
            print("Adding scene number")
            # NOTE(review): the first two separators are identical here and
            # the first bound is 9 rather than 10; if different padding per
            # digit count was intended, adjust these literals.
            if scene_no < 9:
                data = scene_data + ' ' + data.lstrip()
            elif scene_no < 100:
                data = scene_data + ' ' + data.lstrip()
            else:
                data = scene_data + data.lstrip()
            data = data.rjust(len(data) + left_indent)
            data = data.rstrip()
            # Pad the trailing scene number by a width derived from 63 minus
            # the length of the slug text.
            scene_indent = 63 - len(data.strip())
            data = data + scene_data.rjust(scene_indent)
            scene_no += 1
            para = output_doc.add_paragraph()
            paragraph_format = _tight(para.paragraph_format)
            paragraph_format.left_indent = -Inches(0.3)
            para.text = data[12:]
        lines_added += 1

    ## add Fade out
    data = "FADE OUT:"
    data = data.rjust( 58 - len(data))
    para = output_doc.add_paragraph()
    _tight(para.paragraph_format)
    para.text = data
    output_doc.save(output_docx)
# def sa_txt_to_docx(script_txt,output_script_docx):
# output_template_name = 'ScriptTemplate5.docx'
# output_template = os.path.join(mypath,output_template_name)
# new_doc = Document(output_template)
# style = new_doc.styles['Normal']
# font = style.font
# font.name = 'Courier New'
# font.size = Pt(12)
# section = new_doc.sections[0]
# section.page_height = Mm(297)
# section.page_width = Mm(210)
# #section.page_width = Inches(11)
# section.left_margin = Inches(1.5)
# header = section.header
# with open(script_txt,'r',encoding='utf-8') as txt_in:
# lines = txt_in.readlines()
# for line in lines:
# para = new_doc.add_paragraph()
# paragraph_format = para.paragraph_format
# paragraph_format.space_before = Pt(0)
# paragraph_format.space_after = Pt(0)
# paragraph_format.line_spacing = Pt(12)
# if line.strip():
# para.text = line[15:]
# else:
# para.text = ''
# new_doc.save(output_script_docx)
def sa_output_to_txt(output_script_docx,output_script_txt):
    """Dump a script .docx to a plain-text file with space indentation.

    Every paragraph is split on newlines. Each resulting line is written
    with leading spaces equal to a fixed left margin of 15 plus ten spaces
    per inch of the paragraph's (first-line + left) indent. Indents that
    are not set on the paragraph count as 0.

    Parameters:
        output_script_docx: path of the .docx file to read.
        output_script_txt: path of the UTF-8 text file to write.
    """
    from docx import Document

    def _inches(length):
        # python-docx reports an indent that is not set as None.
        return length.inches if length is not None else 0

    read_doc = Document(output_script_docx)
    all_paras = read_doc.paragraphs
    print(len(all_paras))
    left_margin = 15
    with open(output_script_txt, 'w', encoding='utf-8') as f:
        for para in all_paras:
            paragraph_format = para.paragraph_format
            fli = _inches(paragraph_format.first_line_indent)
            li = _inches(paragraph_format.left_indent)
            ri = _inches(paragraph_format.right_indent)
            indent = int((fli + li ) * 10)
            print(fli,li,indent,ri)
            lines = para.text.split('\n')
            print(len(lines))
            for line in lines:
                try:
                    print(line)
                except Exception:
                    # Diagnostic output only; keep exporting if the console
                    # cannot display the text.
                    pass
                line = line.rjust(len(line) + indent + left_margin)
                try:
                    print(line)
                except Exception:
                    pass
                f.write(line)
                f.write('\n')
def print_audit_report_docx(audit_df, audit_report_docx):
    """Write a line-by-line audit report for ``audit_df`` to a .docx file.

    The document starts with a centred ``' Audit Report'`` title.  Then, for
    each index label of ``audit_df``, one paragraph is added holding:

    * ``Line No: <index label>``
    * ``Current Data: <data>``
    * either ``Line was removed`` (when ``line_removed`` is ``'Yes'``), or
      ``Corrected Data: <data_corrected>`` followed by ``Changes Done:- `` and
      a numbered list of the changes flagged for that row
      (``No Changes Done`` when nothing is flagged).

    A change column counts as flagged when its cell is anything other than
    the string ``'No'``.

    Args:
        audit_df: DataFrame providing the columns ``data``, ``line_removed``,
            ``data_corrected``, ``left_indent_corrected``,
            ``right_indent_corrected``, ``case_corrected``,
            ``line_wrapped_at_prescribed_right_indent``,
            ``line_broken_into_multiple_lines`` and
            ``line_merged_with_next_line``.
        audit_report_docx: Destination handed to ``Document.save``.

    Raises:
        KeyError: If one of the columns above is missing.
    """

    def add_line(para, text, breaks=1):
        # One run per report line.  Assigning ``run.text`` replaces whatever
        # the run already contains, so the breaks have to come after it.
        run = para.add_run()
        run.text = text
        for _ in range(breaks):
            run.add_break()

    # (column, how to describe the change given the cell value), in the order
    # the changes are listed in the report.  Cell values are passed through
    # str() because cells are not guaranteed to be strings (e.g. NaN).
    change_columns = (
        ('left_indent_corrected', lambda value: str(value)),
        ('right_indent_corrected', lambda value: str(value)),
        ('case_corrected', lambda value: 'Case ' + str(value)),
        ('line_wrapped_at_prescribed_right_indent',
         lambda value: 'Line Wrapped at Prescribed Right Indent'),
        ('line_broken_into_multiple_lines',
         lambda value: 'Line Broken into Multiple Lines'),
        ('line_merged_with_next_line',
         lambda value: 'Line Merged with Next Line'),
    )

    output_doc = Document()
    title = output_doc.add_paragraph()
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    add_line(title, ' Audit Report', breaks=2)

    for index in audit_df.index:
        para = output_doc.add_paragraph()
        add_line(para, "Line No: " + str(index), breaks=2)
        add_line(para, "Current Data: " + str(audit_df['data'][index]))

        if audit_df['line_removed'][index] == 'Yes':
            # Removed lines have no corrected text or change list to show.
            add_line(para, "Line was removed")
            continue

        add_line(para, "Corrected Data: " + str(audit_df['data_corrected'][index]))
        add_line(para, "Changes Done:- ")

        changes_listed = 0
        for column, describe in change_columns:
            value = audit_df[column][index]
            if value != 'No':
                changes_listed += 1
                add_line(para, str(changes_listed) + '. ' + describe(value))

        if changes_listed == 0:
            add_line(para, 'No Changes Done')

    output_doc.save(audit_report_docx)
def ps_to_script_element(ps):
    """Translate a ``psN`` identification code into a script-element name.

    Args:
        ps: The identification code, e.g. ``'ps4'``.

    Returns:
        The element name for the code (``'Slugline'``, ``'Action'``,
        ``'Speaker'``, ``'Speaker with Extension'``, ``'Speaker Extension'``,
        ``'Parenthetical'``, ``'Dialogue'``, ``'Transition'`` or
        ``'Special Term'``), or ``''`` when the code is not one of the
        known ones.
    """
    # Ordered (element name, codes) pairs; codes are compared in the listed
    # order.  A mapping for 'ps0' ('Title Lines') is currently disabled.
    element_codes = (
        ('Slugline', ('ps1', 'ps2', 'ps3')),
        ('Action', ('ps4', 'ps5', 'ps6')),
        ('Speaker', ('ps7',)),
        ('Speaker with Extension', ('ps8',)),
        ('Speaker Extension', ('ps9',)),
        ('Parenthetical', ('ps10', 'ps11', 'ps12', 'ps20')),
        ('Dialogue', ('ps13', 'ps14', 'ps15')),
        ('Transition', ('ps16',)),
        ('Special Term', ('ps17',)),
    )
    for element, codes in element_codes:
        if ps in codes:
            return element
    return ''
# def print_audit_report_tabular_docx(audit_df):
# print("inside audit report")
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
# output_doc = Document()
# style = output_doc.styles['Normal']
# font = style.font
# #font.name = 'Courier New'
# font.size = Pt(8)
# section = output_doc.sections[-1]
# section.orientation = WD_ORIENT.LANDSCAPE
# section.page_width = Inches(11)
# section.left_margin = Inches(0.25)
# section.right_margin = Inches(0.25)
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# run = para.add_run()
# font = run.font
# font.size = Pt(12)
# run.text = ' Audit Report'
# run.add_break()
# run.add_break()
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.LEFT
# run = para.add_run()
# font = run.font
# font.size = Pt(10)
# run.text = ' Audit Summary'
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(9)
# print("audit summary column is created")
# case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:])
# left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:])
# right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:])
# wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:])
# table =output_doc.add_table(1, cols =2)
# table.style = 'Table Grid'
# font.size = Pt(9)
# heading_cells = table.rows[0].cells
# heading_cells[0].width = Inches(2)
# heading_cells[1].width = Inches(1)
# heading_cells[0].text = 'Type of Change Done'
# heading_cells[1].text = 'Count of Lines'
# for i in range(0,2):
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Case Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(case_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Left Indent Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(left_indent_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Case Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(right_indent_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Lines Wrapped at prescribed indents'
# cells[1].width = Inches(0.5)
# cells[1].text = str(wrapped_lines_count)
# # run.add_break()
# para = output_doc.add_paragraph()
# run = para.add_run()
# run.add_break()
# run.add_break()
# font.size = Pt(8)
# no_rows = len(audit_df.index)
# table =output_doc.add_table(1, cols =6)
# table.style = 'Table Grid'
# table.autofit = False
# # table.columns[0].width = Inches(0.5)
# # table.columns[1].width = Inches(4)
# # table.columns[2].width = Inches(4)
# # table.columns[3].width = Inches(0.5)
# heading_cells = table.rows[0].cells
# heading_cells[0].width = Inches(0.5)
# heading_cells[1].width = Inches(0.5)
# heading_cells[2].width = Inches(3.5)
# heading_cells[3].width = Inches(0.8)
# heading_cells[4].width = Inches(3.5)
# heading_cells[5].width = Inches(2)
# heading_cells[0].text = 'Line No'
# heading_cells[1].text = 'Audited Line No'
# heading_cells[2].text = 'Current Content'
# heading_cells[3].text = 'Script Element'
# heading_cells[4].text = 'New Content'
# heading_cells[5].text = 'Changes Done'
# print("assigned heading")
# for i in range(0,6):
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
# print("assigned Index")
# for index in audit_df.index:
# row_index = 1
# #line_no = audit_df['line_no'][index]
# cells = table.add_row().cells
# cells[0].width = Inches(0.5)
# cells[0].text = str(index)
# audited_line_no = audit_df['audited_line_no'][index]
# data = str(audited_line_no)
# cells[1].width = Inches(0.5)
# cells[1].text = data
# cur_data = audit_df['data'][index]
# data = cur_data
# cells[2].width = Inches(3.5)
# data = str(data)
# cells[2].text = data
# if audit_df['Identification_Status'][index] == 'blank':
# script_element = 'Blank Line'
# elif audit_df['Identification_Status'][index] == '':
# if audit_df['introduction'][index] == 'Yes':
# script_element = 'Title/Introduction'
# elif audit_df['appendix'][index] == 'Yes':
# script_element = 'Appendix'
# # -----------------------------changed with mohit sir
# else:
# continue
# # -----------------------------changed with mohit sir
# else:
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
# data = script_element
# cells[3].width = Inches(0.8)
# cells[3].text = data
# new_data = audit_df['data_corrected'][index]
# data = new_data
# cells[4].width = Inches(3.5)
# data = str(data)
# cells[4].text = data
# # if audit_df['line_removed'][index] == 'Yes':
# # data = "Line was removed"
# # run = para.add_run()
# # run.text = data
# # run.add_break()
# # continue
# sno = 1
# changes_done = False
# if audit_df['left_indent_corrected'][index] != 'No':
# change_comment = audit_df['left_indent_corrected'][index]
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['right_indent_corrected'][index] != 'No':
# change_comment = audit_df['right_indent_corrected'][index]
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['case_corrected'][index] != 'No':
# change_comment = 'Case ' + str(audit_df['case_corrected'][index])
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
# change_comment = 'Line Wrapped at Prescribed Right Indent'
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
# change_comment = 'Line Broken into Multiple Lines'
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_merged_with_next_line'][index] != 'No':
# change_comment = 'Line Merged with Next Line'
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['language_specific_audit_comments'][index] != 'No':
# change_comment = str(audit_df['language_specific_audit_comments'][index])
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if not changes_done:
# data = 'No Changes Done'
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# row_index += 1
# buffer = io.BytesIO()
# output_doc.save(buffer)
# buffer.seek(0)
# print("complete")
# #output_doc.save(audit_report_tabular_docx)
# return buffer
# def print_audit_report_tabular_docx(audit_df):
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
# output_doc = Document()
# style = output_doc.styles['Normal']
# font = style.font
# #font.name = 'Courier New'
# font.size = Pt(8)
# section = output_doc.sections[-1]
# section.orientation = WD_ORIENT.LANDSCAPE
# section.page_width = Inches(11)
# section.left_margin = Inches(0.25)
# section.right_margin = Inches(0.25)
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# run = para.add_run()
# font = run.font
# font.size = Pt(12)
# run.text = ' Audit Report'
# run.add_break()
# run.add_break()
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.LEFT
# run = para.add_run()
# font = run.font
# font.size = Pt(10)
# run.text = ' Audit Summary'
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(9)
# case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:])
# left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:])
# right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:])
# wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:])
# table =output_doc.add_table(1, cols =2)
# table.style = 'Table Grid'
# font.size = Pt(9)
# heading_cells = table.rows[0].cells
# heading_cells[0].width = Inches(2)
# heading_cells[1].width = Inches(1)
# heading_cells[0].text = 'Type of Change Done'
# heading_cells[1].text = 'Count of Lines'
# for i in range(0,2):
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Case Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(case_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Left Indent Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(left_indent_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Case Corrected'
# cells[1].width = Inches(0.5)
# cells[1].text = str(right_indent_corrected_count)
# cells = table.add_row().cells
# font.size = Pt(8)
# cells[0].width = Inches(2)
# cells[0].text = 'Lines Wrapped at prescribed indents'
# cells[1].width = Inches(0.5)
# cells[1].text = str(wrapped_lines_count)
# run.add_break()
# para = output_doc.add_paragraph()
# run = para.add_run()
# run.add_break()
# run.add_break()
# font.size = Pt(8)
# no_rows = len(audit_df.index)
# table =output_doc.add_table(1, cols =6)
# table.alignment = WD_TABLE_ALIGNMENT.CENTER
# table.style = 'Table Grid'
# table.autofit = False
# table.columns[0].width = Inches(0.5)
# table.columns[1].width = Inches(1.2)
# table.columns[2].width = Inches(2)
# table.columns[3].width = Inches(1.5)
# table.columns[4].width = Inches(2)
# table.columns[5].width = Inches(2.5)
# heading_cells = table.rows[0].cells
# heading_cells[0].width = Inches(0.5)
# heading_cells[1].width = Inches(0.5)
# heading_cells[2].width = Inches(3.5)
# heading_cells[3].width = Inches(0.8)
# heading_cells[4].width = Inches(3.5)
# heading_cells[5].width = Inches(2)
# heading_cells[0].text = 'Line No'
# heading_cells[1].text = 'Audited Line No'
# heading_cells[2].text = 'Current Content'
# heading_cells[3].text = 'Script Element'
# heading_cells[4].text = 'New Content'
# heading_cells[5].text = 'Changes Done'
# for i in range(0,6):
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
# for index in audit_df.index:
# columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
# audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
# if audit_df.loc[index, columns_to_check].eq('No').all().all():
# continue
# elif audit_df['introduction'][index] == 'Yes':
# continue
# elif audit_df['appendix'][index] == 'Yes':
# continue
# elif audit_df['Identification_Status'][index] == 'blank':
# continue
# elif pd.isna(audit_df.loc[index, "Identification_Status"]):
# continue
# row_index = 1
# #line_no = audit_df['line_no'][index]
# cells = table.add_row().cells
# cells[0].width = Inches(0.5)
# cells[0].text = str(index)
# audited_line_no = audit_df['audited_line_no'][index]
# data = str(audited_line_no)
# cells[1].width = Inches(0.5)
# cells[1].text = data
# cur_data = audit_df['data'][index]
# data = str(cur_data).strip()
# cells[2].width = Inches(3.5)
# data = str(data)
# cells[2].text = data
# if audit_df['Identification_Status'][index] == 'blank':
# script_element = 'Blank Line'
# elif audit_df['Identification_Status'][index] == '':
# if audit_df['introduction'][index] == 'Yes':
# script_element = 'Title/Introduction'
# elif audit_df['appendix'][index] == 'Yes':
# script_element = 'Appendix'
# # -----------------------------changed with mohit sir
# else:
# continue
# # -----------------------------changed with mohit sir
# else:
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
# data = script_element
# cells[3].width = Inches(0.8)
# cells[3].text = data
# new_data = audit_df['data_corrected'][index]
# data = str(new_data).strip()
# cells[4].width = Inches(3.5)
# data = str(data)
# cells[4].text = data
# sno = 1
# changes_done = False
# # identification_status = audit_df['Identification_Status'][index]
# if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
# continue
# if audit_df['left_indent_corrected'][index] != 'No':
# change_comment = audit_df['left_indent_corrected'][index]
# try:
# str_int = change_comment[-2]+change_comment[-1]
# except Exception as e:
# pass
# if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
# if str_int == "15":
# change_comment = "Dialogue line left index corrected to 1.5 Inch"
# elif str_int == "25":
# change_comment = "Dialogue line left index corrected to 2.5 Inch"
# if str_int == "15":
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f"{name} line left indent corrected to 1.5 Inch"
# print(change_comment)
# elif str_int == "25":
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_commen = f"{name} left indent corrected to 2.5 Inch"
# elif str_int == "30":
# change_comment = "Parenthetical left indent corrected to 3 Inch"
# elif str_int == "35":
# change_comment = "Speaker left indent corrected to 3.5 Inch"
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['right_indent_corrected'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = audit_df['right_indent_corrected'][index]
# try:
# str_int = change_comment[-2]+change_comment[-1]
# except Exception as e:
# pass
# if str_int == "10":
# change_comment = f"{name} right indent corrected to 1 Inch"
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['case_corrected'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# string = str(audit_df['case_corrected'][index])
# string = string.split()
# content = string[-1]
# if content == "AllUpper":
# change_comment = f'{name} Case ' + "Corrected to All Upper"
# elif content == "AllLower":
# change_comment = f'{name} Case ' + "Corrected to All Lowerr"
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
# change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# if name == "Action":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
# elif name == "Dialogue":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f'{name} line Broken into Multiple Lines'
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_merged_with_next_line'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f'{name} line Merged with Next Line'
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['language_specific_audit_comments'][index] != 'No':
# pass
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['blank_inserted_after'][index] != 'No':
# change_comment = 'A blank line is added below'
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if not changes_done:
# continue
# # data = 'No Changes Done'
# # cells[5].width = Inches(2)
# # para = cells[5].add_paragraph()
# # run = para.add_run()
# # run.text = data
# # run.add_break()
# row_index += 1
# buffer = io.BytesIO()
# output_doc.save(buffer)
# buffer.seek(0)
# # output_doc.save(audit_report_tabular_docx)
# return buffer
# def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language):
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
# total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])])
# # <---------------------BLANK LINE ADD AND remove LOGIC IS HERE----------------->
# blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] )
# blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] )
# blankline_inserted = blankline_added + blank_add_after
# blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] )
# blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] )
# blankline_removed_total = blankline_rem_before + blank_rem_after
# ### <<----------------- logic for case --------------------------------->
# # for slugline
# # case corrected
# sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
# print(sluglinecase_corrected_count)
# # indentatioin corrected
# sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count
# print("sluglin_indentation:",slugline_indentation)
# # formate corrected
# slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
# slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6
# print("slugline_formated",slugline_formated)
# #total sluglines
# total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
# print(total_no_sluglines)
# # for actioon -----line
# # case corrected
# actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
# print(actionlinecase_corrected_count)
# # indentatioin corrected
# actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count
# print("actionliine_indentation:",actionline_indentation)
# # formate corrected
# actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
# actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6
# print("actionline_formated",actionline_formated)
# #total no of actionline
# total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
# print(total_actionlines)
# # for Speaker
# # case corrected
# speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :])
# print("speakercase_corrected_count", speakercase_corrected_count)
# # indentatioin corrected
# speakerleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count
# print("speaker_indentation:",speaker_indentation)
# # formate corrected
# speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + speaker_formate4 + speaker_formate5 + speaker_formate6
# print("speaker_formated",speaker_formated)
# #total no of speaker -speaker
# total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
# print(total_no_speaker)
# # for Parenthetical -----line
# # case corrected
# parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :])
# print(parentheticalcase_corrected_count)
# # indentatioin corrected
# parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_line_indentation = parenthetical_left_indent_corrected_count + parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count
# print("parenthetical_line_indentation:",parenthetical_line_indentation)
# # formate corrected
# parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6
# print("parenthetical_formated",parenthetical_formated)
# #total number of parenthetical
# total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
# print(total_no_parenthetical)
# # for Dialogue -----line
# # case corrected
# Dialogue_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :])
# print(Dialogue_case_corrected_count)
# # indentatioin corrected
# dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count
# print("dialogue_line_indentation:",dialogue_line_indentation)
# # formate corrected
# dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6
# print("dialogue_formated",dialogue_formated)
# # total number of dialogue
# total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
# print(total_no_dialogue)
# # for Transistion -----line
# # case corrected
# transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :])
# print(transitions_case_corrected_count)
# # indentatioin corrected
# transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count
# print("transitions_line_indentation:",transitions_line_indentation)
# # formate corrected
# transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
# transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6
# print("transitions_formated",transitions_formated)
# #total transition
# total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:])
# print(total_no_transition)
# # for Spectial Terms -----line
# # case corrected
# st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :])
# print("st_case_corrected_count",st_case_corrected_count)
# # indentatioin corrected
# st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count
# print("st_line_indentation:",st_line_indentation)
# # formate corrected
# st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
# st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6
# print("st_formated",st_formated)
# #total numner of special terms
# total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:])
# if total_special_terms < 1 :
# total_special_terms = 1
# print(total_special_terms)
# # write logic for the percentage
# #a
# difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber)
# average_of_page_no = (int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2
# final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100
# #b
# difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no)
# average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2
# final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100
# #c
# try:
# ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100
# except:
# ratio_for_blanklines = 0
# #j
# try:
# ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100
# except:
# ratio_for_sluglines = 0
# #d
# try:
# ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100
# except:
# ratio_for_actionlines = 0
# #e
# try:
# ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100
# except:
# ratio_for_Speaker = 0
# #f
# try:
# ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100
# except:
# ratio_for_parenthetical = 0
# #g
# try:
# ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100
# except:
# ratio_for_dialogues = 0
# #h
# try:
# ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100
# except:
# ratio_for_transitions = 0
# #i
# try:
# ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100
# except:
# ratio_for_special_terms = 0
# average_of_c_j = (ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7
# audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j)
# audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%"
# print("audit_configuration_percentage",audit_configuration_percentage_str)
# total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms)
# print("total_script_element_correct",total_script_element_correct)
# audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no
# print("audit_script_accuracy",audit_script_accuracy)
# # audit_script_accuracy_str = min(audit_script_accuracy*100 , 100)
# audit_script_accuracy_str = min(audit_script_accuracy*100,100)
# audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%"
# print("audit_script_accuracy_str",audit_script_accuracy_str)
# # the table logics ends here
# # percenteage table from here
# output_doc = Document()
# style = output_doc.styles['Normal']
# font = style.font
# #font.name = 'Courier New'
# font.size = Pt(10)
# section = output_doc.sections[-1]
# section.orientation = WD_ORIENT.LANDSCAPE
# section.page_width = Inches(11)
# section.left_margin = Inches(0.25)
# section.right_margin = Inches(0.25)
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# # Audit Summary at center of the page with bold
# run = para.add_run()
# font = run.font
# font.bold = True
# font.size = Pt(14)
# run.text = ' Audit Summary'
# run.add_break()
# # Add a paragraph for the left-aligned "Audit Date"
# current_date = date.today()
# # Convert to the "day month year" format
# formatted_date = current_date.strftime("%d %B %Y")
# left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAudit Date: " + str(formatted_date))
# left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT
# font_audit_date = left_aligned_text.runs[0].font
# font_audit_date.size = Pt(12)
# para = output_doc.add_paragraph()
# right_aligned_text = para.add_run('\t\tScriptname: ' + str(scriptname))
# right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT
# font_right = right_aligned_text.font
# font_right.size = Pt(12)
# author_para = output_doc.add_paragraph()
# run_author = author_para.add_run("\t\tAuthor: " + str(author))
# font_author = run_author.font
# font_author.size = Pt(12)
# language_script_para = output_doc.add_paragraph()
# run_language_script = language_script_para.add_run("\t\tLanguage of Script: " + str(script_language))
# font_language_script = run_language_script.font
# font_language_script.size = Pt(12)
# language_dialogue_para = output_doc.add_paragraph()
# run_language_dialogue = language_dialogue_para.add_run("\t\tLanguage of Dialogue: " + str(dialogue_language))
# font_language_dialogue = run_language_dialogue.font
# font_language_dialogue.size = Pt(12)
# # for pre audit and post Audit
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(11)
# percent_table = output_doc.add_table(1, cols =2)
# percent_table.alignment = WD_TABLE_ALIGNMENT.CENTER
# percent_table.style = 'Table Grid'
# percent_heading_cells = percent_table.rows[0].cells
# percent_heading_cells[0].width = Inches(1.5)
# percent_heading_cells[1].width = Inches(1)
# percent_heading_cells[0].text = 'Audit Contribution'
# percent_heading_cells[1].text = str(audit_configuration_percentage_str)
# percent_heading_cells = percent_table.add_row().cells
# font.size = Pt(12)
# percent_heading_cells[0].width = Inches(1.5)
# percent_heading_cells[0].text = 'Audit Script Accuracy'
# percent_heading_cells[1].width = Inches(1)
# percent_heading_cells[1].text = str(audit_script_accuracy_str)
# para = output_doc.add_paragraph()
# pre_post_table = output_doc.add_table(1, cols =3)
# pre_post_table.alignment = WD_TABLE_ALIGNMENT.CENTER
# pre_post_table.style = 'Table Grid'
# preheading_cells = pre_post_table.rows[0].cells
# preheading_cells[0].width = Inches(1.5)
# preheading_cells[1].width = Inches(1)
# preheading_cells[2].width = Inches(1)
# preheading_cells[1].text = 'Pre Audit'
# preheading_cells[2].text = 'Post Audit'
# # row No of pages
# pcells = pre_post_table.add_row().cells
# font.size = Pt(12)
# pcells[0].width = Inches(1.5)
# pcells[0].text = 'No of Pages'
# pcells[1].width = Inches(1)
# pcells[1].text = str(pre_audit_pagenumber)
# pcells[2].width = Inches(1)
# pcells[2].text = str(postauditpagenumber)
# # row no of lines
# pcells = pre_post_table.add_row().cells
# font.size = Pt(12)
# pcells[0].width = Inches(1.5)
# pcells[0].text = 'No of lines'
# pcells[1].width = Inches(1)
# pcells[1].text = str(preaudit_line_no)
# pcells[2].width = Inches(1)
# pcells[2].text = str(postaudit_line_no)
# # adding extra line after the table above
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(12)
# #--------------------------------------
# bl_table = output_doc.add_table(1, cols =2)
# bl_table.alignment = WD_TABLE_ALIGNMENT.CENTER
# bl_table.style = 'Table Grid'
# bl_heading_cells = bl_table.rows[0].cells
# bl_heading_cells[0].width = Inches(1.5)
# bl_heading_cells[0].text = 'Blank Lines Added'
# bl_heading_cells[1].width = Inches(1.5)
# bl_heading_cells[1].text = str(blankline_inserted) # add the number here
# blcells = bl_table.add_row().cells
# font.size = Pt(12)
# blcells[0].width = Inches(1.5)
# blcells[0].text = 'Blank Lines Removed'
# blcells[1].width = Inches(1.5)
# blcells[1].text = str(blankline_removed_total) # add the number here
# # adding extra line after the table above
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(12)
# sum_table = output_doc.add_table(1, cols =4)
# sum_table.alignment = WD_TABLE_ALIGNMENT.CENTER
# sum_table.style = 'Table Grid'
# sum_heading_cells = sum_table.rows[0].cells
# sum_heading_cells[0].width = Inches(1.5)
# sum_heading_cells[0].text = ''
# sum_heading_cells[1].width = Inches(1.5)
# sum_heading_cells[1].height = Inches(0.5)
# sum_heading_cells[1].text = 'Case Correction'
# sum_heading_cells[2].width = Inches(1.5)
# sum_heading_cells[2].text = 'Indent Correction'
# sum_heading_cells[3].width = Inches(1.5)
# sum_heading_cells[3].text = 'Format Correction'
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].height = Inches(0.3)
# sum_cells[0].text = 'Sluglines'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].height = Inches(0.3)
# sum_cells[1].text = str(sluglinecase_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].height = Inches(0.3)
# sum_cells[2].text = str(slugline_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].height = Inches(0.3)
# sum_cells[3].text = str(slugline_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Actioin Lines'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(actionlinecase_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(actionline_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(actionline_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Speakers'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(speakercase_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(speaker_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(speaker_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Parentheticals'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(parentheticalcase_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(parenthetical_line_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(parenthetical_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Dialogues'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(Dialogue_case_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(dialogue_line_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(dialogue_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Transitions'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(transitions_case_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(transitions_line_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(transitions_formated)
# sum_cells = sum_table.add_row().cells
# font.size = Pt(12)
# sum_cells[0].width = Inches(1.5)
# sum_cells[0].text = 'Special Terms'
# sum_cells[1].width = Inches(1.5)
# sum_cells[1].text = str(st_case_corrected_count)
# sum_cells[2].width = Inches(1.5)
# sum_cells[2].text = str(st_line_indentation)
# sum_cells[3].width = Inches(1.5)
# sum_cells[3].text = str(st_line_indentation)
# para = output_doc.add_paragraph()
# run = para.add_run()
# font = run.font
# font.size = Pt(12)
# para = output_doc.add_paragraph()
# run = para.add_run()
# run.add_break()
# run.add_break()
# #--------------------------- 14-09-2023
# for _ in range(5):
# output_doc.add_paragraph()
# #----------------------- 14-09-23
# para = output_doc.add_paragraph()
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# # Audit detail at center of the page with bold
# run = para.add_run()
# font = run.font
# font.bold = True
# font.size = Pt(14)
# run.text = ' Audit Details'
# run.add_break()
# # -------------------------- 14-09-23
# no_rows = len(audit_df.index)
# table =output_doc.add_table(1, cols =6)
# table.alignment = WD_TABLE_ALIGNMENT.CENTER
# table.style = 'Table Grid'
# table.autofit = False
# table.columns[0].width = Inches(0.5)
# table.columns[1].width = Inches(1.2)
# table.columns[2].width = Inches(2)
# table.columns[3].width = Inches(1.5)
# table.columns[4].width = Inches(2)
# table.columns[5].width = Inches(2.5)
# # table.columns[3].width = Inches(0.5)
# heading_cells = table.rows[0].cells
# heading_cells[0].width = Inches(0.1)
# heading_cells[1].width = Inches(0.1)
# heading_cells[2].width = Inches(3.5)
# heading_cells[3].width = Inches(0.8)
# heading_cells[4].width = Inches(3.5)
# heading_cells[5].width = Inches(2)
# heading_cells[0].text = 'Line No'
# heading_cells[1].text = 'Audited Line No'
# heading_cells[2].text = 'Current Content'
# heading_cells[3].text = 'Script Element'
# heading_cells[4].text = 'New Content'
# heading_cells[5].text = 'Changes Done'
# for i in range(0,6):
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
# #------------------------------->LOGIC HERE<---------------------------------------------
# report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no'])
# for index in audit_df.index:
# columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
# audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
# if audit_df.loc[index, columns_to_check].eq('No').all().all():
# # All columns contain 'No', skip this row
# continue
# elif audit_df['introduction'][index] == 'Yes':
# continue
# elif audit_df['appendix'][index] == 'Yes':
# continue
# elif audit_df['Identification_Status'][index] == 'blank':
# continue
# elif pd.isna(audit_df.loc[index, "Identification_Status"]):
# continue
# para_value = audit_df["para_no"][index] # ---------------------------------------------><-------------------------
# current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None
# if para_value == current_para_value:
# continue
# else:
# # report_df = report_df.append(audit_df.loc[index], ignore_index=True)
# new_row = audit_df.loc[index].to_frame().T
# report_df = pd.concat([report_df, new_row], ignore_index=True)
# print("current_para_value",current_para_value)
# row_index = 1
# old_line_no_index = index
# collection_old_line_no = []
# while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value):
# if audit_df['Identification_Status'][old_line_no_index] != "blank":
# try:
# data = int(old_line_no_index)
# collection_old_line_no.append(str(data))
# except ValueError:
# pass
# old_line_no_index += 1
# cells = table.add_row().cells
# cells[0].width = Inches(0.1)
# cells[0].text = ', '.join(collection_old_line_no)
# audited_line_index = index
# #--------------------------------------audited_lino_no------------------
# collection_audited_line_no = []
# while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value):
# if audit_df['Identification_Status'][audited_line_index] != "blank":
# audited_line_no = audit_df['audited_line_no'][audited_line_index]
# try:
# data = int(audited_line_no)
# collection_audited_line_no.append(str(data))
# except ValueError:
# pass
# audited_line_index += 1
# print("collection_audited_line_no", collection_audited_line_no)
# data_string = ', '.join(collection_audited_line_no)
# print("data_string:", data_string)
# cells[1].width = Inches(0.1)
# cells[1].text = data_string
# #------------------------------>OLD DATA<---------------------------------
# data_index = index
# collection_data = []
# while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value):
# cur_data = audit_df['data'][data_index]
# if not pd.isna(cur_data): # Check if the value is not NaN
# data = str(cur_data).strip()
# collection_data.append(data)
# data_index += 1
# cells[2].width = Inches(3.5)
# data = str(data)
# cells[2].text = '\n '.join(collection_data)
# if audit_df['Identification_Status'][index] == 'blank':
# script_element = 'Blank Line'
# elif audit_df['Identification_Status'][index] == '':
# if audit_df['introduction'][index] == 'Yes':
# script_element = 'Title/Introduction'
# elif audit_df['appendix'][index] == 'Yes':
# script_element = 'Appendix'
# else:
# continue
# else:
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
# data = script_element
# cells[3].width = Inches(0.8)
# cells[3].text = data
# collection_new_data = []
# new_data_index = index
# while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value):
# if audit_df["line_removed"][new_data_index] == "No":
# new_data = audit_df['data_corrected'][new_data_index]
# if not pd.isna(new_data): # Check if the value is not NaN
# data = str(new_data).strip()
# collection_new_data.append(data)
# new_data_index += 1
# data = str(new_data).strip()
# cells[4].width = Inches(3.5)
# data = str(data)
# cells[4].text = '\n '.join(collection_new_data)
# sno = 1
# changes_done = False
# # identification_status = audit_df['Identification_Status'][index]
# if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
# continue
# if audit_df['left_indent_corrected'][index] != 'No':
# change_comment = audit_df['left_indent_corrected'][index]
# try:
# str_int = change_comment[-2]+change_comment[-1]
# except Exception as e:
# pass
# if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
# if str_int == "15":
# change_comment = "Dialogue line left index corrected to 1.5 Inch"
# elif str_int == "25":
# change_comment = "Dialogue line left index corrected to 2.5 Inch"
# if str_int == "15":
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f"{name} line left indent corrected to 1.5 Inch"
# print(change_comment)
# elif str_int == "25":
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_commen = f"{name} left indent corrected to 2.5 Inch"
# elif str_int == "30":
# change_comment = "Parenthetical left indent corrected to 3 Inch"
# elif str_int == "35":
# change_comment = "Speaker left indent corrected to 3.5 Inch"
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['right_indent_corrected'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = audit_df['right_indent_corrected'][index]
# try:
# str_int = change_comment[-2]+change_comment[-1]
# except Exception as e:
# pass
# if str_int == "10":
# change_comment = f"{name} right indent corrected to 1 Inch"
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['case_corrected'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# string = str(audit_df['case_corrected'][index])
# string = string.split()
# content = string[-1]
# if content == "AllUpper":
# change_comment = f'{name} Case ' + "Corrected to All Upper"
# elif content == "AllLower":
# change_comment = f'{name} Case ' + "Corrected to All Lowerr"
# if len(str(change_comment)) <= 2 :
# continue
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
# change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# if name == "Action":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
# elif name == "Dialogue":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f'{name} line Broken into Multiple Lines'
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['line_merged_with_next_line'][index] != 'No':
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f'{name} line Merged with Next Line'
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['language_specific_audit_comments'][index] != 'No':
# pass
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
# data = str(sno) + '. ' + str(change_comment)
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if audit_df['blank_inserted_after'][index] != 'No':
# change_comment = 'A blank line is added below'
# # name = ps_to_script_element(audit_df['Identification_Status'][index])
# # if name == "Action":
# # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
# # elif name == "Dialogue":
# # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
# data = str(sno) + '. ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if not changes_done:
# continue
# # data = 'No Changes Done'
# # cells[5].width = Inches(2)
# # para = cells[5].add_paragraph()
# # run = para.add_run()
# # run.text = data
# # run.add_break()
# row_index += 1
# buffer = io.BytesIO()
# output_doc.save(buffer)
# buffer.seek(0)
# #output_doc.save(audit_report_tabular_docx)
# return buffer
def count_the_line(text_file_path):
    """Return how many lines the text file at ``text_file_path`` contains.

    The file is opened in text mode with the default encoding, exactly as
    a plain ``open(path, 'r')`` would.
    """
    with open(text_file_path, 'r') as handle:
        # Iterating the handle yields the same lines readlines() would.
        return sum(1 for _ in handle)
def convert_to_pdf(input_docx, out_folder):
    """Convert ``input_docx`` to PDF by running headless LibreOffice.

    The ``libreoffice`` executable is launched with ``--outdir out_folder``
    and this call blocks until the process exits. The exit status is not
    inspected, so a failed conversion is not reported to the caller.
    """
    command = ["libreoffice", "--headless", "--convert-to", "pdf"]
    command += ["--outdir", out_folder, input_docx]
    converter = subprocess.Popen(command)
    # Progress trace, emitted while the converter is already running.
    print(["--convert-to", "pdf", input_docx])
    converter.communicate()
def countPages(docfile, pdf_file_path, base_path_directory):
    """Convert ``docfile`` to PDF and return the page count of the result.

    Note that this definition replaces the ``countPages`` name imported
    from ``utils.scripts_functions`` at the top of the file.

    Args:
        docfile: Path of the document handed to ``convert_to_pdf``.
        pdf_file_path: Path where the converted PDF is expected to be
            found (the caller is responsible for it matching the name
            LibreOffice writes into ``base_path_directory``).
        base_path_directory: Output directory passed to ``convert_to_pdf``.

    Returns:
        The number of pages reported by PyPDF2 for ``pdf_file_path``.
    """
    convert_to_pdf(docfile, base_path_directory)
    print("converted to pdf")
    print("pdf_file_path",pdf_file_path)
    # Read the page count while the file is open, then release the handle;
    # the previous version left the file object open for the GC to close.
    with open(pdf_file_path, "rb") as pdf_stream:
        number_of_pages = PdfFileReader(pdf_stream).getNumPages()
    return number_of_pages
def convert_txt_to_docx(txt_file_path, docx_file_path):
    """Copy a UTF-8 text file into a new .docx as one single paragraph.

    The whole content of ``txt_file_path`` is read at once, added to a
    fresh python-docx document as a single paragraph, and the document is
    saved to ``docx_file_path``.
    """
    document = docx.Document()
    with open(txt_file_path, 'r', encoding='utf-8') as source:
        body = source.read()
    document.add_paragraph(body)
    document.save(docx_file_path)
def csv_to_docx(csv: pd.DataFrame) -> Document:
    """Build a screenplay-formatted Word document from a script DataFrame.

    One paragraph is added per row of ``csv``. The paragraph's indents
    depend on the row's ``script_element``; slugline and speaker text is
    upper-cased and transitions are right-aligned. Rows whose element is
    ``"blank"`` leave an empty paragraph. Elements not listed in the
    indent table keep the default indents.

    Args:
        csv: DataFrame with ``script_element`` and ``content`` columns.
            (The parameter name shadows the ``csv`` module inside this
            function; it is kept for caller compatibility.)

    Returns:
        The in-memory python-docx document; nothing is written to disk.
    """
    # (left, right) paragraph indents in inches for each known element.
    indents = {
        "slugline": (0, 0),
        "action": (0, 0),
        "dialogue": (1.0, 1.25),
        "parenthetical": (1.5, 2.25),
        "speaker": (2, 1),
        "transition": (2.5, 0),
        "special_term": (0, 0),
    }
    upper_cased = ("slugline", "speaker")

    output_doc = Document()
    font = output_doc.styles["Normal"].font
    font.name = "Courier New"
    font.size = Pt(12)
    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Inches(8.57)
    section.left_margin = Inches(1.5)

    for index in csv.index:
        para = output_doc.add_paragraph()
        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        script_element = csv["script_element"][index]
        content = csv["content"][index]
        if script_element == "blank":
            # The empty paragraph added above stands in for the blank line.
            continue

        # Empty cells come back from pandas as float NaN. Normalise them
        # before any string method is called: previously a slugline or
        # speaker row with an empty cell crashed on ``content.upper()``.
        if isinstance(content, float):
            content = ""

        if script_element in indents:
            left, right = indents[script_element]
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)
        if script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        if script_element in upper_cased:
            content = content.upper()
        para.text = content
    return output_doc
def language_detector_for_csv(orginal_csv_path):
    """Detect the language of every action and dialogue line of a script.

    Args:
        orginal_csv_path: Either an already-loaded DataFrame or anything
            ``pandas.read_csv`` accepts (path, buffer). The data must have
            ``script_element`` and ``content`` columns.

    Returns:
        A tuple ``(actionline_lang, dialogue_lang)``. Each is a list with
        one entry per matching row, and each entry is a one-element list
        holding ``language_code[language_detector(content)]``.

    Raises:
        Whatever ``pandas.read_csv`` raises when the input is not a
        DataFrame and cannot be read (for example ``FileNotFoundError``).
        ``KeyError`` if a detected language is missing from
        ``language_code``.
    """
    # Decide explicitly instead of wrapping read_csv in a bare ``except``:
    # that swallowed every error and turned an unreadable path into an
    # unrelated AttributeError further down.
    if isinstance(orginal_csv_path, pd.DataFrame):
        audit_df = orginal_csv_path
    else:
        audit_df = pd.read_csv(orginal_csv_path)

    actionline_lang = []
    dialogue_lang = []
    for script_element, content in zip(audit_df["script_element"], audit_df["content"]):
        if script_element == "action":
            actionline_lang.append([language_code[language_detector(content)]])
        elif script_element == "dialogue":
            dialogue_lang.append([language_code[language_detector(content)]])
    return actionline_lang, dialogue_lang
def assign_para_no(df):
    """Assign a running paragraph number to the rows of an audited script.

    A ``para_no`` column (initially 0) is added to ``df`` in place. Rows
    are visited in index order and grouped by their
    ``Identification_Status`` code: a row that opens a group receives the
    current number, the rows that continue the group receive the same
    number, and the counter then advances. Rows that are never assigned
    (blank rows outside a ps13 run, continuation rows with no opener
    before them, ...) keep 0.

    The row labels are assumed to be consecutive integers, because
    neighbours are addressed as ``index + 1`` / ``index - 1``. Looking
    past the first or last row now yields "no status" instead of raising
    ``KeyError``, so a script may begin or end inside a group.

    NOTE(review): the status codes appear to follow the grouping used by
    the audit report in this file (ps1-3 slugline, ps4-6 action, ps7-9
    speaker, ps10-12/ps20 parenthetical, ps13-15 dialogue, ps16
    transition, ps17 special term) - confirm against the classifier.

    Args:
        df: DataFrame with an ``Identification_Status`` column.

    Returns:
        A DataFrame with the same data and ``para_no`` moved to column
        position 3 (or last, if there are fewer columns).
    """
    para_no = 1
    df['para_no'] = 0
    # Snapshot of the statuses keyed by row label; ``.get`` returns None
    # for labels outside the frame, which never matches any status test.
    status_by_label = dict(zip(df.index, df['Identification_Status']))
    status_at = status_by_label.get
    row_count = len(df)

    for index in df.index:
        line_pos = status_by_label[index]
        if line_pos == 'blank':
            continue

        # Single rows that always open a paragraph of their own.
        if line_pos in ('ps1', 'ps8', 'ps10', 'ps16', 'ps17'):
            df.at[index, 'para_no'] = para_no
            para_no += 1
            continue

        if line_pos == 'ps2':
            df.at[index, 'para_no'] = para_no
            if status_at(index + 1) == 'ps3':
                df.at[index + 1, 'para_no'] = para_no
            para_no += 1
            continue

        if line_pos == 'ps4':
            df.at[index, 'para_no'] = para_no
            spot_index = index + 1
            while status_at(spot_index) in ["ps5", "ps6", "ps4"]:
                df.at[spot_index, 'para_no'] = para_no
                spot_index += 1
            para_no += 1
            continue

        if line_pos == 'ps13':
            df.at[index, 'para_no'] = para_no
            spot_index = index + 1
            while spot_index < row_count and status_at(spot_index) in ["ps14", "ps15", "ps13", "blank"]:
                if status_at(spot_index) == "blank":
                    # A blank directly followed by ps14 is stepped over so
                    # the ps14 row joins the run; the blank itself keeps 0.
                    if spot_index + 1 < row_count and status_at(spot_index + 1) == "ps14":
                        df.at[spot_index + 1, 'para_no'] = para_no
                        spot_index += 1
                df.at[spot_index, 'para_no'] = para_no
                spot_index += 1
            para_no += 1
            continue

        if line_pos == 'ps6':
            # Already numbered by the preceding ps4 run when it follows one.
            if status_at(index - 1) not in ["ps5", "ps4"]:
                df.at[index, 'para_no'] = para_no
                para_no += 1
            continue

        if line_pos == 'ps7':
            df.at[index, 'para_no'] = para_no
            spot_index = index + 1
            while status_at(spot_index) in ["ps8", "ps9"]:
                df.at[spot_index, 'para_no'] = para_no
                spot_index += 1
            para_no += 1
            continue

        if line_pos == 'ps15':
            if status_at(index - 1) in ["ps7", "ps12", "ps10", "ps20", "ps8", "blank"]:
                df.at[index, 'para_no'] = para_no
                para_no += 1
            continue

        if line_pos == 'ps14':
            if status_at(index - 1) in ["ps8", "ps7"]:
                df.at[index, 'para_no'] = para_no
                spot_index = index + 1
                while status_at(spot_index) == "ps15":
                    df.at[spot_index, 'para_no'] = para_no
                    spot_index += 1
                para_no += 1
            continue

        if line_pos == 'ps11':
            df.at[index, 'para_no'] = para_no
            spot_index = index + 1
            while status_at(spot_index) in ["ps12", "ps20"]:
                df.at[spot_index, 'para_no'] = para_no
                spot_index += 1
            para_no += 1
            continue

        if line_pos == 'ps12':
            # Never opens a paragraph; numbered only via a ps11/ps20 opener.
            continue

        if line_pos == 'ps20':
            if status_at(index - 1) == "ps11":
                continue
            if status_at(index + 1) == "ps12":
                df.at[index, 'para_no'] = para_no
                df.at[index + 1, 'para_no'] = para_no
                para_no += 1
                continue
            # NOTE(review): a stand-alone ps20 consumes a number but keeps
            # para_no 0 - behaviour preserved as found; confirm intent.
            para_no += 1
            continue

    columns = list(df.columns)
    columns.insert(3, columns.pop(columns.index('para_no')))
    df = df[columns]
    return df
def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language):
#line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])])
# <---------------------BLANK LINE ADD AND remove LOGIC IS HERE----------------->
blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] )
blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] )
blankline_inserted = blankline_added + blank_add_after
blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] )
blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] )
blankline_removed_total = blankline_rem_before + blank_rem_after
### <<----------------- logic for case --------------------------------->
# for slugline
# case corrected
sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
print(sluglinecase_corrected_count)
# indentatioin corrected
sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count
print("sluglin_indentation:",slugline_indentation)
# formate corrected
slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6
print("slugline_formated",slugline_formated)
#total sluglines
total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
print(total_no_sluglines)
# for actioon -----line
# case corrected
actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
print(actionlinecase_corrected_count)
# indentatioin corrected
actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count
print("actionliine_indentation:",actionline_indentation)
# formate corrected
actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6
print("actionline_formated",actionline_formated)
#total no of actionline
total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
print(total_actionlines)
# for Speaker
# case corrected
speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :])
print("speakercase_corrected_count", speakercase_corrected_count)
# indentatioin corrected
speakerleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count
print("speaker_indentation:",speaker_indentation)
# formate corrected
speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + speaker_formate4 + speaker_formate5 + speaker_formate6
print("speaker_formated",speaker_formated)
#total no of speaker -speaker
total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
print(total_no_speaker)
# for Parenthetical -----line
# case corrected
parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :])
print(parentheticalcase_corrected_count)
# indentatioin corrected
parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_line_indentation = parenthetical_left_indent_corrected_count + parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count
print("parenthetical_line_indentation:",parenthetical_line_indentation)
# formate corrected
parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6
print("parenthetical_formated",parenthetical_formated)
#total number of parenthetical
total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
print(total_no_parenthetical)
# for Dialogue -----line
# case corrected
Dialogue_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :])
print(Dialogue_case_corrected_count)
# indentatioin corrected
dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count
print("dialogue_line_indentation:",dialogue_line_indentation)
# formate corrected
dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6
print("dialogue_formated",dialogue_formated)
# total number of dialogue
total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
print(total_no_dialogue)
# for Transistion -----line
# case corrected
transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :])
print(transitions_case_corrected_count)
# indentatioin corrected
transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count
print("transitions_line_indentation:",transitions_line_indentation)
# formate corrected
transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6
print("transitions_formated",transitions_formated)
#total transition
total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:])
print(total_no_transition)
# for Spectial Terms -----line
# case corrected
st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :])
print("st_case_corrected_count",st_case_corrected_count)
# indentatioin corrected
st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count
print("st_line_indentation:",st_line_indentation)
# formate corrected
st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6
print("st_formated",st_formated)
#total numner of special terms
total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:])
if total_special_terms < 1 :
total_special_terms = 1
print(total_special_terms)
# write logic for the percentage
#a
difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber)
average_of_page_no = (int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2
final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100
#b
difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no)
average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2
final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100
#c
try:
ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100
except:
ratio_for_blanklines = 0
#j
try:
ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100
except:
ratio_for_sluglines = 0
#d
try:
ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100
except:
ratio_for_actionlines = 0
#e
try:
ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100
except:
ratio_for_Speaker = 0
#f
try:
ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100
except:
ratio_for_parenthetical = 0
#g
try:
ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100
except:
ratio_for_dialogues = 0
#h
try:
ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100
except:
ratio_for_transitions = 0
#i
try:
ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100
except:
ratio_for_special_terms = 0
average_of_c_j = (ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7
audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j)
audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%"
print("audit_configuration_percentage",audit_configuration_percentage_str)
total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms)
print("total_script_element_correct",total_script_element_correct)
audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no
print("audit_script_accuracy",audit_script_accuracy)
# audit_script_accuracy_str = min(audit_script_accuracy*100 , 100)
audit_script_accuracy_str = audit_script_accuracy*100
audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%"
print("audit_script_accuracy_str",audit_script_accuracy_str)
# the table logics ends here
# percenteage table from here
output_doc = Document()
style = output_doc.styles['Normal']
font = style.font
#font.name = 'Courier New'
font.size = Pt(10)
section = output_doc.sections[-1]
section.orientation = WD_ORIENT.LANDSCAPE
section.page_width = Inches(11)
section.left_margin = Inches(0.25)
section.right_margin = Inches(0.25)
para = output_doc.add_paragraph()
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Audit Summary at center of the page with bold
run = para.add_run()
font = run.font
font.bold = True
font.size = Pt(18)
run.text = ' Audit Summary'
para = output_doc.add_paragraph()
# run.add_break()
# Add a paragraph for the left-aligned "Audit Date"
current_date = date.today()
# Convert to the "day month year" format
formatted_date = current_date.strftime("%d %B %Y")
string_date = "𝐀𝐮𝐝𝐢𝐭 𝐃𝐚𝐭𝐞"
left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t"+ string_date+ " : " + str(formatted_date))
left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT
font_audit_date = left_aligned_text.runs[0].font
font_audit_date.size = Pt(14)
font.bold = True
para = output_doc.add_paragraph()
table = output_doc.add_table(rows=2, cols=2)
table.alignment = WD_TABLE_ALIGNMENT.CENTER
table.style = 'Colorful Shading Accent 6'
table.autofit = False # Turn off autofit to set cell widths explicitly
# Set cell widths (you can adjust these values as needed)
table.columns[0].width = Pt(150)
table.columns[1].width = Pt(100)
# Access the first cell in the first row
cell = table.cell(0, 0)
cell.text = "Audit Contributions"
cell1 = table.cell(0, 1)
cell1.text = audit_configuration_percentage_str
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
for paragraph in cell1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Set vertical alignment of the label cell to center
cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
# Access the first cell in the second row
cell = table.cell(1, 0)
cell.text = "Audited Accuracy"
cell1 = table.cell(1,1)
cell1.text = audit_script_accuracy_str
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
for paragraph in cell1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Set vertical alignment of the label cell to center
cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
para = output_doc.add_paragraph()
para = output_doc.add_paragraph()
para = output_doc.add_paragraph()
right_aligned_text = para.add_run('\t\t𝐒𝐜𝐫𝐢𝐩𝐭 𝐍𝐚𝐦𝐞: ' + str(scriptname))
right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT
font_right = right_aligned_text.font
font_right.size = Pt(14)
author_para = output_doc.add_paragraph()
run_author = author_para.add_run("\t\t𝐀𝐮<F09D9080><F09D90AE>𝐡𝐨𝐫: " + str(author))
font_author = run_author.font
font_author.size = Pt(14)
language_script_para = output_doc.add_paragraph()
run_language_script = language_script_para.add_run("\t\t𝐋𝐚𝐧𝐠𝐮𝐚𝐠𝐞 𝐨𝐟 𝐒𝐜𝐫𝐢𝐩𝐭: " + str(script_language))
font_language_script = run_language_script.font
font_language_script.size = Pt(14)
language_dialogue_para = output_doc.add_paragraph()
run_language_dialogue = language_dialogue_para.add_run("\t\t𝐋𝐚𝐧𝐠𝐮𝐚𝐠𝐞 𝐨𝐟 𝐃𝐢𝐚𝐥𝐨𝐠𝐮𝐞: " + str(dialogue_language))
font_language_dialogue = run_language_dialogue.font
font_language_dialogue.size = Pt(14)
# Tighten spacing to 1 pt before and after each paragraph currently in output_doc.paragraphs
for para in output_doc.paragraphs:
para.paragraph_format.space_before = Pt(1)
para.paragraph_format.space_after = Pt(1)
para = output_doc.add_paragraph()
para = output_doc.add_paragraph()
# changes_string_line = output_doc.add_paragraph()
# run_changes_string_line = changes_string_line.add_run("\t\tStructural Changes\t\t\t\t\\t Blank Lines Adjustments ")
# font_changes_string_line = run_changes_string_line.font
# # Set font properties
# font_changes_string_line.color.rgb = WD_COLOR_INDEX.RED # Red font color
# font_changes_string_line.italic = True # Italic style
# font_changes_string_line.bold = True
paragraph = output_doc.add_paragraph()
paragraph = output_doc.add_paragraph()
run = paragraph.add_run("\t\t\t\tStructural Changes\t\t\t\t\t\tBlank Lines Adjustments ")
# Set font size
font = run.font
font.size = Pt(14)
# Set font color to red
font.color.rgb = RGBColor(255, 0, 0)
font.bold = True
font.italic = True
for para in output_doc.paragraphs:
para.paragraph_format.space_before = Pt(0)
para.paragraph_format.space_after = Pt(0)
table = output_doc.add_table(rows=1, cols=2)
table.allow_autofit = False
table.alignment = WD_TABLE_ALIGNMENT.CENTER
table._cells[0].width = Inches(4.3)
table._cells[1].width = Inches(4.3)
column_first = table._cells[0].add_table(rows=3, cols=3)
column_second = table._cells[1].add_table(rows=2, cols=2)
column_first.style = 'Colorful Shading Accent 6'
column_second.style = 'Colorful Shading Accent 6'
column_first_row1 = column_first.cell(0,1)
column_first_row1.text ="Pre Audit"
column_first_row1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_first_row1_c2 = column_first.cell(0,2)
column_first_row1_c2.text ="Post Audit"
column_first_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row1_c2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_first_row2_c1 = column_first.cell(1,0)
column_first_row2_c1.text ="No of Pages"
column_first_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row2_c1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_first_row2_c2 = column_first.cell(1,1)
column_first_row2_c2.text = str(pre_audit_pagenumber)
column_first_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row2_c2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
column_first_row2_c3 = column_first.cell(1,2)
column_first_row2_c3.text = str(postauditpagenumber)
column_first_row2_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row2_c3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
column_first_row3_c1 = column_first.cell(2,0)
column_first_row3_c1.text = "No of Lines"
column_first_row3_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row3_c1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_first_row3_c2 = column_first.cell(2,1)
column_first_row3_c2.text = str(preaudit_line_no)
column_first_row3_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row3_c2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
column_first_row3_c3 = column_first.cell(2,2)
column_first_row3_c3.text = str(postaudit_line_no)
column_first_row3_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_first_row3_c3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
column_second_row1_c1 = column_second.cell(0,0)
column_second_row1_c1.text = "Blank Lines Added"
column_second_row1_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_second_row1_c1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_second_row1_c2 = column_second.cell(0,1)
column_second_row1_c2.text = str(blankline_inserted)
column_second_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_second_row1_c2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
column_second_row2_c1 = column_second.cell(1,0)
column_second_row2_c1.text = "Blank Lines Removed"
column_second_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_second_row2_c1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
column_second_row2_c2 = column_second.cell(1,1)
column_second_row2_c2.text = str(blankline_removed_total)
column_second_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in column_second_row2_c2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
paragraph = output_doc.add_paragraph()
paragraph = output_doc.add_paragraph()
paragraph = output_doc.add_paragraph()
run = paragraph.add_run("Summary of Correction made")
# Set font size
font = run.font
font.size = Pt(14)
# Set font color to red
font.color.rgb = RGBColor(255, 0, 0)
font.bold = True
font.italic = True
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table = output_doc.add_table(rows=9, cols=5)
summary_table.allow_autofit = False
summary_table.alignment = WD_TABLE_ALIGNMENT.CENTER
summary_table.style = 'Colorful Shading Accent 6'
# # Calculate the column widths
# column_widths = [Inches(1), Inches(1.5), Inches(1.5), Inches(1.5), Inches(1)] # Adjust the widths as needed
# # Set the column widths
# for col, width in enumerate(column_widths):
# summary_table.columns[col].width = width
# table.columns[0].width = Pt(150)
# table.columns[1].width = Pt(100)
# summary_table.columns[0].width = Inches(1)
# summary_table.columns[1].width = Inches(1)
# summary_table.columns[2].width = Inches(1)
# summary_table.columns[3].width = Inches(1)
# summary_table.columns[4].width = Inches(0.5)
summary_table_row1_col_2 = summary_table.cell(0,1)
summary_table_row1_col_2.text ="Case Correction"
summary_table_row1_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row1_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row1_col_3 = summary_table.cell(0,2)
summary_table_row1_col_3.text ="Indent Correction"
summary_table_row1_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row1_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row1_col_4 = summary_table.cell(0,3)
summary_table_row1_col_4.text ="Format Correction"
summary_table_row1_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row1_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row1_col_5 = summary_table.cell(0,4)
summary_table_row1_col_5.text ="Total"
summary_table_row1_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
summary_table_row1_col_5.width = Inches(0.5)
for paragraph in summary_table_row1_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
run.font.bold = True
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#---------ROW 2------------
summary_table_row2_col_1 = summary_table.cell(1,0)
summary_table_row2_col_1.text ="Sluglines"
summary_table_row2_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row2_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row2_col_2 = summary_table.cell(1,1)
summary_table_row2_col_2.text = str(sluglinecase_corrected_count)
summary_table_row2_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row2_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row2_col_3 = summary_table.cell(1,2)
summary_table_row2_col_3.text = str(slugline_indentation)
summary_table_row2_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row2_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row2_col_4 = summary_table.cell(1,3)
summary_table_row2_col_4.text = str(slugline_formated)
summary_table_row2_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row2_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row2_col_5 = summary_table.cell(1,4)
total_slug = slugline_formated+slugline_indentation+sluglinecase_corrected_count
summary_table_row2_col_5.text = str(total_slug)
summary_table_row2_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row2_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row3_col_1 = summary_table.cell(2,0)
summary_table_row3_col_1.text = "Action Lines"
summary_table_row3_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row3_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row3_col_2 = summary_table.cell(2,1)
summary_table_row3_col_2.text = str(actionlinecase_corrected_count)
summary_table_row3_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row3_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row3_col_3 = summary_table.cell(2,2)
summary_table_row3_col_3.text = str(actionline_indentation)
summary_table_row3_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row3_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row3_col_4 = summary_table.cell(2,3)
summary_table_row3_col_4.text = str(actionline_formated)
summary_table_row3_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row3_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row3_col_5 = summary_table.cell(2,4)
total_action_line = actionlinecase_corrected_count+actionline_indentation+actionline_formated
summary_table_row3_col_5.text = str(total_action_line)
summary_table_row3_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row3_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#-----ROW 4 ---------
summary_table_row4_col_1 = summary_table.cell(3,0)
summary_table_row4_col_1.text = "Speaker"
summary_table_row4_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row4_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
# paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row4_col_2 = summary_table.cell(3,1)
summary_table_row4_col_2.text = str(speakercase_corrected_count)
summary_table_row4_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row4_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row4_col_3 = summary_table.cell(3,2)
summary_table_row4_col_3.text = str(speaker_indentation)
summary_table_row4_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row4_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row4_col_4 = summary_table.cell(3,3)
summary_table_row4_col_4.text = str(speaker_formated)
summary_table_row4_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row4_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row4_col_5 = summary_table.cell(3,4)
total_speaker = speaker_formated+speaker_indentation+speakercase_corrected_count
summary_table_row4_col_5.text = str(total_speaker)
summary_table_row4_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row4_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
# ----ROW 5 -------
summary_table_row5_col_1 = summary_table.cell(4,0)
summary_table_row5_col_1.text = "Parentheticals"
summary_table_row5_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row5_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row5_col_2 = summary_table.cell(4,1)
summary_table_row5_col_2.text = str(parentheticalcase_corrected_count)
summary_table_row5_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row5_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row5_col_3 = summary_table.cell(4,2)
summary_table_row5_col_3.text = str(parenthetical_line_indentation)
summary_table_row5_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row5_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row5_col_4 = summary_table.cell(4,3)
summary_table_row5_col_4.text = str(parenthetical_formated)
summary_table_row5_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row5_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row5_col_5 = summary_table.cell(4,4)
total_parenthetical = parenthetical_formated + parenthetical_line_indentation+parentheticalcase_corrected_count
summary_table_row5_col_5.text = str(total_parenthetical)
summary_table_row5_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row5_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#ROW --- 6
summary_table_row6_col_1 = summary_table.cell(5,0)
summary_table_row6_col_1.text = "Dialogue"
summary_table_row6_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row6_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row6_col_2 = summary_table.cell(5,1)
summary_table_row6_col_2.text = str(Dialogue_case_corrected_count)
summary_table_row6_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row6_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row6_col_3 = summary_table.cell(5,2)
summary_table_row6_col_3.text = str(dialogue_line_indentation)
summary_table_row6_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row6_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row6_col_4 = summary_table.cell(5,3)
summary_table_row6_col_4.text = str(dialogue_formated)
summary_table_row6_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row6_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row6_col_5 = summary_table.cell(5,4)
total_dialogue = dialogue_formated + dialogue_line_indentation+Dialogue_case_corrected_count
summary_table_row6_col_5.text = str(total_dialogue)
summary_table_row6_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row6_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#ROW --- 7
summary_table_row7_col_1 = summary_table.cell(6,0)
summary_table_row7_col_1.text = "Transitions"
summary_table_row7_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row7_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row7_col_2 = summary_table.cell(6,1)
summary_table_row7_col_2.text = str(transitions_case_corrected_count)
summary_table_row7_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row7_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row7_col_3 = summary_table.cell(6,2)
summary_table_row7_col_3.text = str(transitions_line_indentation)
summary_table_row7_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row7_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row7_col_4 = summary_table.cell(6,3)
summary_table_row7_col_4.text = str(transitions_formated)
summary_table_row7_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row7_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row7_col_5 = summary_table.cell(6,4)
total_transition = transitions_formated+transitions_line_indentation+transitions_case_corrected_count
summary_table_row7_col_5.text = str(total_transition)
summary_table_row7_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row7_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#ROW --- 8
summary_table_row8_col_1 = summary_table.cell(7,0)
summary_table_row8_col_1.text = "Special Terms"
summary_table_row8_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row8_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
summary_table_row8_col_2 = summary_table.cell(7,1)
summary_table_row8_col_2.text = str(st_case_corrected_count)
summary_table_row8_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row8_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row8_col_3 = summary_table.cell(7,2)
summary_table_row8_col_3.text = str(st_line_indentation)
summary_table_row8_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row8_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row8_col_4 = summary_table.cell(7,3)
summary_table_row8_col_4.text = str(st_formated)
summary_table_row8_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row8_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row8_col_5 = summary_table.cell(7,4)
total_special_term = st_formated +st_line_indentation+ st_case_corrected_count
summary_table_row8_col_5.text = str(total_special_term)
summary_table_row8_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row8_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#ROW --- 9
summary_table_row9_col_1 = summary_table.cell(8,0)
summary_table_row9_col_1.text = "Total"
summary_table_row9_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row9_col_1.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
run.font.bold = True
summary_table_row9_col_2 = summary_table.cell(8,1)
summary_table_row9_col_2.text = str(sluglinecase_corrected_count+actionlinecase_corrected_count+speakercase_corrected_count+parentheticalcase_corrected_count +Dialogue_case_corrected_count+transitions_case_corrected_count+st_case_corrected_count)
summary_table_row9_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row9_col_2.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row9_col_3 = summary_table.cell(8,2)
summary_table_row9_col_3.text = str(slugline_indentation+actionline_indentation+speaker_indentation+parenthetical_line_indentation+dialogue_line_indentation+transitions_line_indentation+st_line_indentation)
summary_table_row9_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row9_col_3.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row9_col_4 = summary_table.cell(8,3)
summary_table_row9_col_4.text = str(slugline_formated+actionline_formated+speaker_formated+parenthetical_formated+dialogue_formated+transitions_formated+st_formated)
summary_table_row9_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row9_col_4.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
summary_table_row9_col_5 = summary_table.cell(8,4)
summary_table_row9_col_5.text = str(total_slug+total_action_line+total_speaker+total_parenthetical+total_dialogue+total_transition+total_special_term)
summary_table_row9_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
for paragraph in summary_table_row9_col_5.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(14)
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
#--------------------------- 14-09-2023
for _ in range(3):
output_doc.add_paragraph()
#----------------------- 14-09-23
para = output_doc.add_paragraph()
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Audit detail at center of the page with bold
run = para.add_run()
font = run.font
font.bold = True
font.size = Pt(18)
run.text = ' Audit Details'
run.add_break()
# -------------------------- 14-09-23
no_rows = len(audit_df.index)
table =output_doc.add_table(1, cols =6)
table.alignment = WD_TABLE_ALIGNMENT.CENTER
table.style = 'Colorful Shading Accent 6'
table.autofit = False
table.columns[0].width = Inches(0.5)
table.columns[1].width = Inches(1.2)
table.columns[2].width = Inches(2)
table.columns[3].width = Inches(1.5)
table.columns[4].width = Inches(2)
table.columns[5].width = Inches(2.5)
# table.columns[3].width = Inches(0.5)
heading_cells = table.rows[0].cells
heading_cells[0].width = Inches(0.1)
heading_cells[1].width = Inches(0.1)
heading_cells[2].width = Inches(3.5)
heading_cells[3].width = Inches(0.8)
heading_cells[4].width = Inches(3.5)
heading_cells[5].width = Inches(2)
heading_cells[0].text = 'Line No'
heading_cells[1].text = 'Audited Line No'
heading_cells[2].text = 'Current Content'
heading_cells[3].text = 'Script Element'
heading_cells[4].text = 'New Content'
heading_cells[5].text = 'Changes Done'
for i in range(0,6):
heading_cells[i].paragraphs[0].runs[0].font.bold = True
heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
#------------------------------->LOGIC HERE<---------------------------------------------
report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no'])
for index in audit_df.index:
columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
if audit_df.loc[index, columns_to_check].eq('No').all().all():
# All columns contain 'No', skip this row
continue
elif audit_df['introduction'][index] == 'Yes':
continue
elif audit_df['appendix'][index] == 'Yes':
continue
elif audit_df['Identification_Status'][index] == 'blank':
continue
elif pd.isna(audit_df.loc[index, "Identification_Status"]):
continue
para_value = audit_df["para_no"][index] # ---------------------------------------------><-------------------------
current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None
if para_value == current_para_value:
continue
else:
# report_df = report_df.append(audit_df.loc[index], ignore_index=True)
new_row = audit_df.loc[index].to_frame().T
report_df = pd.concat([report_df, new_row], ignore_index=True)
print("current_para_value",current_para_value)
row_index = 1
old_line_no_index = index
collection_old_line_no = []
while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value):
if audit_df['Identification_Status'][old_line_no_index] != "blank":
try:
data = int(old_line_no_index)
collection_old_line_no.append(str(data))
except ValueError:
pass
old_line_no_index += 1
cells = table.add_row().cells
cells[0].width = Inches(0.1)
cells[0].text = ', '.join(collection_old_line_no)
audited_line_index = index
#--------------------------------------audited_lino_no------------------
collection_audited_line_no = []
while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value):
if audit_df['Identification_Status'][audited_line_index] != "blank":
audited_line_no = audit_df['audited_line_no'][audited_line_index]
try:
data = int(audited_line_no)
collection_audited_line_no.append(str(data))
except ValueError:
pass
audited_line_index += 1
print("collection_audited_line_no", collection_audited_line_no)
data_string = ', '.join(collection_audited_line_no)
print("data_string:", data_string)
cells[1].width = Inches(0.1)
cells[1].text = data_string
#------------------------------>OLD DATA<---------------------------------
data_index = index
collection_data = []
while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value):
cur_data = audit_df['data'][data_index]
if not pd.isna(cur_data): # Check if the value is not NaN
data = str(cur_data).strip()
collection_data.append(data)
data_index += 1
cells[2].width = Inches(3.5)
data = str(data)
cells[2].text = '\n '.join(collection_data)
if audit_df['Identification_Status'][index] == 'blank':
script_element = 'Blank Line'
elif audit_df['Identification_Status'][index] == '':
if audit_df['introduction'][index] == 'Yes':
script_element = 'Title/Introduction'
elif audit_df['appendix'][index] == 'Yes':
script_element = 'Appendix'
else:
continue
else:
script_element = ps_to_script_element(audit_df['Identification_Status'][index])
data = script_element
cells[3].width = Inches(0.8)
cells[3].text = data
collection_new_data = []
new_data_index = index
while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value):
if audit_df["line_removed"][new_data_index] == "No":
new_data = audit_df['data_corrected'][new_data_index]
if not pd.isna(new_data): # Check if the value is not NaN
data = str(new_data).strip()
collection_new_data.append(data)
new_data_index += 1
data = str(new_data).strip()
cells[4].width = Inches(3.5)
data = str(data)
cells[4].text = '\n '.join(collection_new_data)
sno = 1
changes_done = False
# identification_status = audit_df['Identification_Status'][index]
if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
continue
if audit_df['left_indent_corrected'][index] != 'No':
change_comment = audit_df['left_indent_corrected'][index]
try:
str_int = change_comment[-2]+change_comment[-1]
except Exception as e:
pass
if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
if str_int == "15":
change_comment = "Dialogue line left index corrected to 1.5 Inch"
elif str_int == "25":
change_comment = "Dialogue line left index corrected to 2.5 Inch"
if str_int == "15":
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_comment = f"{name} Line left indent corrected to 1.5 Inch"
print(change_comment)
elif str_int == "25":
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_commen = f"{name} Left indent corrected to 2.5 Inch"
elif str_int == "30":
change_comment = "Parenthetical left indent corrected to 3 Inch"
elif str_int == "35":
change_comment = "Speaker left indent corrected to 3.5 Inch"
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['right_indent_corrected'][index] != 'No':
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_comment = audit_df['right_indent_corrected'][index]
try:
str_int = change_comment[-2]+change_comment[-1]
except Exception as e:
pass
if str_int == "10":
change_comment = f"{name} Line right indent corrected to 1 Inch"
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['case_corrected'][index] != 'No':
name = ps_to_script_element(audit_df['Identification_Status'][index])
string = str(audit_df['case_corrected'][index])
string = string.split()
content = string[-1]
if content == "AllUpper":
change_comment = f'{name} Case ' + "Corrected to All Upper"
elif content == "AllLower":
change_comment = f'{name} Case ' + "Corrected to All Lowerr"
if len(str(change_comment)) <= 2 :
continue
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
name = ps_to_script_element(audit_df['Identification_Status'][index])
if name == "Action":
change_comment = f'{name} Line Wrapped at Prescribed Right Indent 1 Inch'
elif name == "Dialogue":
change_comment = f'{name} Line Wrapped at Prescribed Right Indent 2 Inch'
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['line_broken_into_multiple_lines'][index] != 'No':
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_comment = f'{name} line Broken into Multiple Lines'
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['line_merged_with_next_line'][index] != 'No':
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_comment = f'{name} line Merged with Next Line'
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['language_specific_audit_comments'][index] != 'No':
pass
name = ps_to_script_element(audit_df['Identification_Status'][index])
change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
data = str(sno) + '. ' + str(change_comment)
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if audit_df['blank_inserted_after'][index] != 'No':
change_comment = 'A blank line is added below'
# name = ps_to_script_element(audit_df['Identification_Status'][index])
# if name == "Action":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
# elif name == "Dialogue":
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
data = str(sno) + '. ' + str(change_comment)
# dataa = data.split()
# if dataa[-1] == "nan":
# continue
cells[5].width = Inches(2)
para = cells[5].add_paragraph()
run = para.add_run()
run.text = data
run.add_break()
sno += 1
changes_done = True
if not changes_done:
continue
# data = 'No Changes Done'
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
row_index += 1
buffer = io.BytesIO()
output_doc.save(buffer)
buffer.seek(0)
#output_doc.save(audit_report_tabular_docx)
return buffer
def replace_dot_with_comma(slugline):
    """Replace '.' with ',' inside the location part of a slugline.

    A slugline is matched as ``<optional prefix><location> - <time>`` where
    the prefix is one of ``INT./EXT. ``, ``INT. ``, ``EXT. ``, ``E/I. ``,
    ``INT `` or ``EXT`` and the time part consists of upper-case letters and
    whitespace. Only the location text has its dots converted; the prefix
    and the time part are copied through, and the separator is always
    re-emitted as ``' - '``.

    The pattern is not anchored, so the prefix is only recognised when it
    sits exactly where a match begins; text in front of it is treated as
    part of the location. Input without a matching ``-`` section is
    returned unchanged.

    Args:
        slugline: Slugline text to normalise.

    Returns:
        The slugline with dots in the location replaced by commas.
    """
    slug_regex = r'((?:INT\./EXT\. |INT\. |EXT\. |E/I\. |INT |EXT)?)\s*(.*?)\s*-\s*([A-Z\s]+)'

    def _rebuild(found):
        # All three groups always participate in a match (the prefix group
        # may simply be empty), so unpacking never yields None.
        prefix, place, tail = found.groups()
        return prefix + place.replace(".", ",") + ' - ' + tail

    return re.compile(slug_regex).sub(_rebuild, slugline)
def change_dot_to_comma_inslug(df):
    """Apply ``replace_dot_with_comma`` to every slugline row of ``df``.

    Rows whose ``script_element`` is ``'slugline'`` have their ``content``
    rewritten in place; all other rows are left untouched. The text before
    and after the rewrite is printed for each slugline.

    Args:
        df: DataFrame with ``script_element`` and ``content`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for row_label, record in df.iterrows():
        # Guard clause: only sluglines are rewritten.
        if record['script_element'] != 'slugline':
            continue
        original_slug = record['content']
        print(original_slug)
        fixed_slug = replace_dot_with_comma(original_slug)
        print(fixed_slug)
        df.loc[row_label, 'content'] = fixed_slug
    return df
def fdx_to_audited_df(input_script):
    """Convert a Final Draft (.fdx) script into an audited-style DataFrame.

    The file is parsed with ``utilities.fdx_to_csv`` (expected to return a
    table exposing ``Text`` and ``Script_Element`` columns). The result is
    reshaped as follows:

    * Final Draft element names are mapped to the internal lower-case names
      (``slugline``, ``speaker``, ``parenthetical``, ``transition``,
      ``action``, ``dialogue``).
    * A ``blank`` row with empty content is inserted after every slugline,
      action, dialogue and transition that is not the last row, except
      after a dialogue that is directly followed by a parenthetical or
      another dialogue.
    * Slugline, speaker and transition text is upper-cased; parenthetical
      text is lower-cased.
    * ``para_no`` numbers the rows from 1. ``scene_no`` is the running
      scene counter, which starts at 1 and is incremented *after* each
      slugline row has been numbered.

    Args:
        input_script: Path to the .fdx file.

    Returns:
        DataFrame with columns ``para_no``, ``scene_no``, ``content`` and
        ``script_element`` and a fresh 0..n-1 index.
    """
    # The context manager guarantees the handle is closed even when parsing
    # raises; the previous bare open() leaked it.
    with open(input_script, 'r') as fdx:
        fdx_df = utilities.fdx_to_csv(fdx)

    df = pd.DataFrame(columns=['para_no', 'scene_no', 'content', 'script_element'])
    df['content'] = fdx_df['Text']
    df['script_element'] = fdx_df['Script_Element']
    di = {
        'Scene Heading': 'slugline',
        'Character': 'speaker',
        'Parenthetical': 'parenthetical',
        'Transition': 'transition',
        'Action': 'action',
        'Dialogue': 'dialogue',
    }
    df.replace({"script_element": di}, inplace=True)

    # Insert blank rows after slugline / action / dialogue / transition.
    # A blank is parked at label ``index + 0.5`` so that the sort_index()
    # below drops it right after the row it follows. Iteration runs over
    # the index object captured at loop start, so the rows added here are
    # not visited. Assumes the parsed table carries the default 0..n-1
    # integer index.
    count = len(df)
    for index in df.index:
        se = df.loc[index, 'script_element']
        if index < (count - 1):
            if se in ('slugline', 'action', 'dialogue', 'transition'):
                # Skip parentheticals / continued dialogue in between dialogues.
                next_se = df.loc[index + 1, 'script_element']
                if not (se == 'dialogue' and next_se in ('parenthetical', 'dialogue')):
                    df.loc[index + 0.5] = np.nan
                    df.loc[index + 0.5, 'script_element'] = 'blank'
                    df.loc[index + 0.5, 'content'] = ''
        # Upper case for slugline, speaker and transition; lower case for
        # parenthetical. A more elaborate slugline routine lives in
        # sa_functions_english and would have to be repurposed here.
        if se in ('slugline', 'speaker', 'transition'):
            df.loc[index, 'content'] = str(df.loc[index, 'content']).upper()
        if se == 'parenthetical':
            df.loc[index, 'content'] = str(df.loc[index, 'content']).lower()

    df = df.sort_index().reset_index(drop=True)

    # Add para_no and scene_no. Values are written through .loc: the former
    # chained form ``df[col][index] = value`` assigns to a temporary under
    # pandas Copy-on-Write and leaves the frame unchanged.
    # NOTE(review): a slugline row receives the counter value from *before*
    # the increment, so the rows that follow it carry a scene_no one higher
    # than the slugline itself - confirm this is what consumers expect.
    para_no = 1
    scene_no = 1
    for index in df.index:
        df.loc[index, 'para_no'] = para_no
        df.loc[index, 'scene_no'] = scene_no
        if df.loc[index, 'script_element'] == 'slugline':
            scene_no += 1
        para_no += 1

    return df