14495 lines
534 KiB
Plaintext
Executable File
14495 lines
534 KiB
Plaintext
Executable File
import numpy as np
|
||
import pandas as pd
|
||
import math
|
||
import os
|
||
import csv
|
||
import subprocess
|
||
import io
|
||
import shutil
|
||
from centralisedFileSystem.models import File, Script
|
||
from pathlib import Path
|
||
import re
|
||
import textwrap
|
||
import docx
|
||
from docx import Document
|
||
from docx.shared import Pt, RGBColor
|
||
from docx.shared import Mm,Inches
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
from docx.enum.table import WD_TABLE_ALIGNMENT
|
||
from docx.enum.table import WD_CELL_VERTICAL_ALIGNMENT
|
||
from docx.enum.section import WD_ORIENT
|
||
from pdf2docx import parse
|
||
# import pdftotext
|
||
from scriptAudit.exceptions import ScriptAuditException
|
||
from utils import utilities
|
||
from datetime import date
|
||
from PyPDF2 import PdfFileReader, PdfFileWriter
|
||
from utils.scripts_functions import countPages
|
||
from conversion.translation.detection import script_det, language_detector
|
||
from conversion.translation.translation_variables import get_language_script_code, language_code
|
||
|
||
#mypath= str(Path(__file__).resolve().parent.parent) + "/neutralAudit/matrices/"
|
||
# Folder containing the audit matrices, located next to this module.
mypath = f"{Path(__file__).resolve().parent}/matrices/"
|
||
# mypath = os.getcwd() +'\\'
|
||
|
||
def convert_to_pdf(input_docx, out_folder):
    """Convert ``input_docx`` to PDF by running headless LibreOffice.

    The ``libreoffice`` executable is started with ``--convert-to pdf`` and
    ``--outdir out_folder``; the call blocks until the process has finished.
    The exit status of the process is not inspected.
    """
    command = [
        'libreoffice', '--headless', '--convert-to', 'pdf',
        '--outdir', out_folder, input_docx,
    ]
    converter = subprocess.Popen(command)
    converter.communicate()
|
||
|
||
|
||
def check_space_line(value):
    """Return "Y" when ``value`` is non-empty and whitespace only, else "N"."""
    return "Y" if value.isspace() else "N"
|
||
|
||
def check_space(data):
    """Return how many consecutive ``" "`` characters ``data`` starts with.

    Only the plain space character is counted; tabs and other whitespace end
    the run.
    """
    without_leading_spaces = data.lstrip(" ")
    return len(data) - len(without_leading_spaces)
|
||
|
||
def get_last_char_pos(data):
    """Return the index of the last non-whitespace character of ``data``.

    Returns ``0`` when ``data`` consists only of whitespace and ``None`` when
    ``data`` is empty.

    The position is derived from a single ``rstrip`` instead of slicing and
    testing the remainder of the string at every position, which made the
    previous implementation quadratic in the line length.
    """
    if not data:
        return None
    # For whitespace-only input rstrip() yields "", so clamp the index at 0.
    return max(len(data.rstrip()) - 1, 0)
|
||
|
||
|
||
def get_case(value):
    """Classify the letter-case pattern of a line.

    Returns one of:
      "None"       - whitespace-only or empty line
      "AllUpper"   - ``value.isupper()``
      "AllLower"   - ``value.islower()``
      "FirstCamel" - first word starts with a capital but is not all caps
      "FirstUpper" - first word is all caps and longer than one character
      "EndUpper"   - last word is all caps and longer than one character
      "MidUpper"   - some other word is all caps and longer than one character
      "Partial"    - none of the above
    Words are the pieces obtained by splitting the left-stripped line on
    single spaces.
    """
    if value.isspace():
        return "None"
    if value.isupper():
        return "AllUpper"
    if value.islower():
        return "AllLower"

    tokens = value.lstrip().split(" ")
    leading = tokens[0]
    if not leading:
        # Empty input: there is no first character to inspect.
        return "None"

    if leading[0].isupper() and not leading.isupper():
        return "FirstCamel"
    if leading.isupper() and len(leading) > 1:
        return "FirstUpper"

    trailing = tokens[-1]
    if trailing.isupper() and len(trailing) > 1:
        return "EndUpper"

    if any(token.isupper() and len(token) > 1 for token in tokens):
        return "MidUpper"
    return "Partial"
|
||
|
||
def conv_pdf_to_docx(input_script, output_converted_docx):
    """Convert the PDF ``input_script`` to a .docx at ``output_converted_docx``.

    Delegates to ``pdf2docx.parse`` starting at page 0 with no end page.
    """
    page_range = {'start': 0, 'end': None}
    parse(input_script, output_converted_docx, **page_range)
|
||
|
||
|
||
def conv_docx_to_txt(input_script, output_converted_txt):
    """Render the paragraphs of a .docx file as fixed-width plain text.

    Every paragraph is written to ``output_converted_txt`` (UTF-8). Horizontal
    positions are expressed in tenths of an inch and used as character
    columns: the section's left margin plus the paragraph indent become
    leading spaces, and lines are wrapped with ``textwrap.wrap`` to the
    remaining canvas width. Paragraph spacing above 5pt before/after produces
    an extra blank line. A single-word paragraph that ends a section and
    contains a "continued" marker is dropped.

    Changes over the previous version: an empty document no longer crashes on
    an unused ``all_paras[0]`` lookup, a section change past the last known
    section no longer raises ``AttributeError`` (the fallback geometry is
    used instead), and bare ``except:`` clauses were narrowed to
    ``except Exception``. Per-line debug dumps were removed.

    NOTE(review): ``indent`` is adjusted inside the per-line loop and the
    adjusted value carries over to the following lines of the same paragraph
    (centering/right-alignment offsets accumulate). That is kept as it was;
    confirm whether it is intended.
    """
    read_doc = Document(input_script)

    def recalculate_section_properties(n):
        """Return ``(section, canvas_width, left_margin)`` for section ``n``.

        Falls back to ``(None, 65, 15)`` when the section does not exist or
        its page geometry cannot be read.
        """
        try:
            section = read_doc.sections[n]
            section_width_inches = section.page_width.inches
            margins_inches = section.left_margin.inches + section.right_margin.inches
            canvas_width = int((section_width_inches - margins_inches) * 10)
            left_margin = int(section.left_margin.inches * 10)
        except Exception:
            return None, 65, 15
        return section, canvas_width, left_margin

    def read_or(default, getter):
        """Return ``getter()``, or ``default`` when the docx value is unset."""
        try:
            return getter()
        except Exception:
            return default

    n = 0
    section, canvas_width, left_margin = recalculate_section_properties(n)
    if section is None:
        # First section unreadable: retry with the last section of the document.
        n = -1
        section, canvas_width, left_margin = recalculate_section_properties(n)

    all_paras = read_doc.paragraphs
    print("number of paras", len(all_paras))

    skip_words = ['CONT', 'CONTD', 'CONTINUED', "CONT'D"]
    previous_indent = 0

    with open(output_converted_txt, 'w', encoding='utf-8') as f:
        for para in all_paras:
            paragraph_format = para.paragraph_format

            space_before = read_or(0.0, lambda: paragraph_format.space_before.pt)
            space_after = read_or(0.0, lambda: paragraph_format.space_after.pt)

            # Skip paragraphs with a tiny fixed line height that follow a
            # deeply indented paragraph.
            line_spacing = read_or(None, lambda: paragraph_format.line_spacing.pt)
            if line_spacing is not None and line_spacing < 5 and previous_indent > 20:
                continue

            # A <w:sectPr> inside the paragraph properties marks the last
            # paragraph of a section.
            section_changed = False
            skip_paragraph = False
            try:
                if para._p.xpath("./w:pPr/w:sectPr"):
                    section_changed = True
                    text = para.text.split(' ')
                    skip_paragraph = len(text) == 1 and any(
                        skip_word in text[0] for skip_word in skip_words
                    )
            except Exception:
                pass

            if skip_paragraph:
                # Drop the "continued" marker but still move to the next section.
                n = n + 1
                section, canvas_width, left_margin = recalculate_section_properties(n)
                continue

            if float(space_before) > 5.0:
                f.write('\n')

            fli = read_or(
                0,
                lambda: 0 if para.style.name == 'List Paragraph'
                else paragraph_format.first_line_indent.inches,
            )
            li = read_or(0, lambda: paragraph_format.left_indent.inches)
            ri = read_or(0, lambda: paragraph_format.right_indent.inches)

            indent = int((fli + li) * 10)

            lines = para.text.split('\n')

            # Remove a number hanging before the margin and a trailing number
            # placed after a long run of blanks.
            if len(lines) == 1:
                if indent < 0:
                    indent = 0
                lines[0] = lines[0].rstrip()
                if len(lines[0]) > 40:
                    if lines[0][40:-2].strip() == '' and re.search(r'\d', lines[0][-2:]):
                        lines[0] = lines[0][0:-2]

            alignment = str(para.alignment)

            for line in lines:
                line = line.replace('\t', ' ')
                if indent == 0:
                    indent = check_space(line)

                line = line.strip()
                if not line:
                    f.write(line)
                    f.write('\n')
                    continue

                try:
                    width = int(canvas_width - (indent + ri * 10))
                except (TypeError, ValueError):
                    width = 58 - indent

                if alignment == 'CENTER (1)':
                    indent = indent + int((width - len(line)) / 2)
                elif alignment == 'RIGHT (2)':
                    indent = indent + int(width - len(line))
                else:
                    if alignment == 'JUSTIFY (3)':
                        line = ' '.join(line.split())
                    if indent + left_margin > 55:
                        indent = indent - 1

                if width <= 0:
                    width = 1

                wrapped_lines = textwrap.wrap(line, width)
                if len(wrapped_lines) > 1:
                    for wrapped_line in wrapped_lines:
                        wrapped_line = wrapped_line.rjust(
                            len(wrapped_line) + indent + left_margin
                        )
                        f.write(wrapped_line)
                        f.write('\n')
                    continue

                line = line.rjust(len(line) + indent + left_margin)
                f.write(line)
                f.write('\n')

            if section_changed:
                n = n + 1
                print("Section changed")
                # Returns the fallback geometry when there is no section n.
                section, canvas_width, left_margin = recalculate_section_properties(n)

            if space_after > 5.0:
                f.write('\n')

            previous_indent = indent + left_margin

    print("Converted to text")
|
||
|
||
|
||
def conv_pdf_to_txt(input_script, output_converted_txt):
    """Extract the text of the PDF ``input_script`` into a UTF-8 text file.

    The text of each page is extracted with ``PyPDF2.PdfFileReader`` and the
    pages are joined with a blank line between them.

    The previous body referenced a ``pdf`` object that was never created (the
    ``pdftotext`` call had been commented out), so every call failed with
    ``NameError``.

    NOTE(review): PyPDF2 text extraction does not preserve the page layout
    the way ``pdftotext`` did - confirm this is acceptable for callers that
    rely on column positions.
    """
    with open(input_script, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        pages = [
            reader.getPage(page_no).extractText() or ""
            for page_no in range(reader.getNumPages())
        ]

    # How many pages?
    print(len(pages))

    txt_data = "\n\n".join(pages)

    with open(output_converted_txt, "w", encoding="utf8") as out_file:
        out_file.write(txt_data)
|
||
|
||
|
||
|
||
def conv_pdf_to_txt_java(input_script, output_converted_txt):
    """Extract PDF text through the py4j Java gateway and write it to a file.

    Calls ``entry_point.strip(pdf_file)`` on a ``JavaGateway`` and writes
    ``str(result['payload'])`` to ``output_converted_txt`` (UTF-8). According
    to the original inline note, the result is a dict with 'success'
    ('true'/'false'), 'payload' (the extracted content on success) and 'error'
    (the message on failure); 'success' is not checked here.

    The output file is now written inside a ``with`` block so the handle is
    flushed and closed; previously it was left open.
    """
    from py4j.java_gateway import JavaGateway

    gw = JavaGateway()
    result = gw.entry_point.strip(input_script)

    print(result['payload'])
    with open(output_converted_txt, "w", encoding="utf8") as out_file:
        out_file.write(str(result['payload']))
|
||
|
||
|
||
def conv_to_txt(input_script, output_converted_docx, output_converted_txt):
    """Convert a script file to plain text, dispatching on its extension.

    * txt  - copied unchanged to ``output_converted_txt``
    * pdf  - converted to ``output_converted_docx`` first, then to text
    * docx - converted to text directly
    * fdx  - converted with ``utilities.fdx_to_txt``

    The extension is matched case-insensitively. The fdx input handle is
    closed after use and the fdx output is written as UTF-8, the encoding the
    other converters in this file use.

    Raises:
        ScriptAuditException: for any other extension.
    """
    extention = input_script.rsplit(".", 1)[-1]
    file_type = extention.lower()

    if file_type == "txt":
        shutil.copyfile(input_script, output_converted_txt)

    elif file_type == "pdf":
        conv_pdf_to_docx(input_script, output_converted_docx)
        conv_docx_to_txt(output_converted_docx, output_converted_txt)

    elif file_type == "docx":
        conv_docx_to_txt(input_script, output_converted_txt)

    elif file_type == "fdx":
        with open(input_script, 'r') as fdx:
            plain_txt = utilities.fdx_to_txt(fdx)
        with open(output_converted_txt, 'w', encoding='utf-8') as f:
            f.write(plain_txt)

    else:
        raise ScriptAuditException(f"{extention} file is not supported for Audit!")
|
||
|
||
|
||
|
||
def conv_to_df(txt_script):
    """Load a UTF-8 text script into a DataFrame with one row per line.

    Columns: ``line_no`` (1-based, stored as the string form of a float, e.g.
    "1.0"), ``data`` (the line text without its newline),
    ``Identification_Status`` ('') and ``isIdentified`` ('No').

    The file is closed after reading and the frame is built in one step
    instead of being grown row by row.
    """
    with open(txt_script, 'r', encoding="utf-8") as script_file:
        script_lines = script_file.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']
    rows = []
    for position, data in enumerate(script_lines, start=1):
        line_no = float(position)
        print("processing line", line_no)
        rows.append([str(line_no), data, '', 'No'])

    return pd.DataFrame(rows, columns=fields)
|
||
|
||
|
||
def conv_to_csv(txt_script, csv_for_processing):
    """Write a UTF-8 text script to a CSV file with one row per line.

    The CSV has the header ``line_no,data,Identification_Status,isIdentified``
    followed by one row per script line: the 1-based line number in float
    string form (e.g. "1.0"), the line text, '' and 'No'.

    The output is opened once and written entirely as UTF-8. Previously the
    header was written with the platform default encoding and the file was
    reopened in append mode for every row.
    """
    with open(txt_script, 'r', encoding="utf-8") as script_file:
        script_lines = script_file.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']

    with open(csv_for_processing, 'w', encoding='utf-8', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for position, data in enumerate(script_lines, start=1):
            line_no = float(position)
            print("processing line", line_no)
            csvwriter.writerow([str(line_no), data, '', 'No'])
|
||
|
||
|
||
|
||
def pre_assign_wts(df):
    """Split "SPEAKER (parenthetical): dialogue" lines and pre-assign weights.

    Lines whose text contains ``:-``, ``:``, ``-`` or a quoted string, and no
    scene/transition keyword, are treated as possible one-line dialogue. When
    the part before the separator is upper case without digits and dialogue
    text follows, the row keeps only the speaker (weight 'ps7-20') and new
    rows are inserted directly after it for the parenthetical ('ps10-20') and
    the dialogue ('ps13-20;ps14-20;ps15-20'). Finally the frame is re-indexed
    and ``line_no`` renumbered 1.0, 2.0, ...

    ``df`` must already contain a 'preassigned_weights' column and numeric
    ``line_no`` values. Returns the new DataFrame; the speaker rows of the
    input frame are modified in place before it is re-indexed.

    Cell writes use ``.loc`` instead of chained ``df[col][index] = ...``
    assignment, which is not guaranteed to write through to the frame.
    """
    skip_words = ['INT.', 'EXT.', 'I/E', 'E/I', 'CUT TO', 'CUT BACK TO', 'FLASHCUT TO', 'DISSOLVE TO', 'INTERCUT', 'INTER CUT', 'PBS', 'INTERVAL',
                  'FLASHBACK', 'FADE IN', 'FADE TO BLACK', 'ON THE SCREEN', 'ON THE TV', 'MORNING', 'AT HOTEL', 'TV', 'MONTAGES', 'MUSICAL MONTAGES', 'ESSENTIALS', 'LATER', 'ESSENTIAL']

    stripped = df['data'].str.strip()
    has_separator = stripped.str.contains(r':-|:|-|".*"') == True
    no_skip_word = stripped.str.contains('|'.join(skip_words)) == False
    pos_sp_dial_line_nos = df.loc[has_separator & no_skip_word, 'line_no'].to_list()
    print(pos_sp_dial_line_nos)

    for index in df.loc[df['line_no'].isin(pos_sp_dial_line_nos), :].index:
        data = df.loc[index, 'data']
        line_no = df.loc[index, 'line_no']
        pos_sp_par = ''
        pos_dia = ''

        # The first separator present, in priority order, splits the line.
        for separator in (":-", ":", "-"):
            if separator in data:
                parts = data.split(separator)
                pos_sp_par = parts[0]
                pos_dia = parts[-1].strip()
                break
        else:
            if "\"" in data:
                parts = data.split("\"")
                pos_sp_par = parts[0]
                pos_dia = parts[-2].strip()

        pos_sp_par = pos_sp_par.strip()
        pos_sp = ''
        pos_par = ''
        if pos_sp_par:
            if "(" in pos_sp_par and ")" in pos_sp_par:
                pos_sp = pos_sp_par.split("(")[0]
                pos_par = "(" + pos_sp_par.split("(")[-1]
            else:
                pos_sp = pos_sp_par

        if not pos_sp:
            continue
        has_digit = any(character.isdigit() for character in pos_sp)
        if has_digit or not pos_sp.isupper() or not pos_dia.strip():
            continue

        df.loc[index, 'data'] = pos_sp
        df.loc[index, 'preassigned_weights'] = 'ps7-20'

        # Fractional labels place the new rows right after the speaker row
        # once the frame is sorted by index below.
        if pos_par:
            par_label = index + 0.3
            df.loc[par_label] = np.nan
            df.loc[par_label, 'data'] = pos_par
            df.loc[par_label, 'line_no'] = line_no + 0.3
            df.loc[par_label, 'isIdentified'] = 'No'
            df.loc[par_label, 'preassigned_weights'] = 'ps10-20'

        if pos_dia:
            dia_label = index + 0.6
            df.loc[dia_label] = np.nan
            df.loc[dia_label, 'data'] = pos_dia
            df.loc[dia_label, 'line_no'] = line_no + 0.6
            df.loc[dia_label, 'isIdentified'] = 'No'
            df.loc[dia_label, 'preassigned_weights'] = 'ps13-20;ps14-20;ps15-20'

    df = df.sort_index().reset_index(drop=True)
    df['line_no'] = [float(position + 1) for position in df.index]

    return df
|
||
|
||
|
||
|
||
|
||
def create_audit_df(df):
    """Build the audit-trail DataFrame for a script DataFrame.

    The result holds the ``data`` column of ``df`` plus one tracking column
    per kind of correction, indexed by ``line_no``. The text columns
    ('Identification_Status', 'data_corrected', 'audited_line_no',
    'scene_number') start empty; every flag column starts as 'No'.

    The two source columns are copied first, so adding the tracking columns
    never writes into (or warns about) a slice of ``df``.
    """
    audit_df = df[['line_no', 'data']].copy()

    text_columns = [
        'Identification_Status',
        'data_corrected',
        'audited_line_no',
        'scene_number',
    ]
    flag_columns = [
        'line_removed',
        'introduction',
        'appendix',
        'page_no',
        'left_indent_corrected',
        'right_indent_corrected',
        'line_wrapped_at_prescribed_right_indent',
        'case_corrected',
        'blank_inserted_before',
        'blank_inserted_after',
        'blank_deleted_before',
        'blank_deleted_after',
        'space_removed_between_characters',
        'space_added_between_characters',
        'line_merged_with_next_line',
        'line_broken_into_multiple_lines',
        'punctuation_mark_added',
        'punctuation_mark_removed',
        'language_specific_audit_comments',
    ]
    for column in text_columns:
        audit_df[column] = ''
    for column in flag_columns:
        audit_df[column] = 'No'

    audit_df.set_index('line_no', inplace=True)

    return audit_df
|
||
|
||
def trim_intro(df, audit_df):
    """Drop the title/introduction lines that precede the first "FADE IN".

    Scans ``df`` for the first line matching "FADE IN" (case-insensitive,
    whitespace collapsed) whose index label is at most 100. That row and every
    row with a smaller label down to 0 are flagged in ``audit_df``
    ('line_removed' and 'introduction' set to 'Yes', keyed by ``line_no``) and
    removed from ``df`` in place. Assumes ``df`` has the default 0..n-1 index.

    Audit flags are written with ``.loc`` rather than chained assignment, and
    the rows are dropped in a single call.
    """
    stopwords = ['FADE IN']

    for index in df.index:
        data = ' '.join(df.at[index, 'data'].split())
        if not any(re.search(sw, data, re.IGNORECASE) for sw in stopwords):
            continue

        print("Found Fade In", index)
        if index > 100:
            # Too far into the script to be the opening; keep scanning.
            continue

        print("removing lines till ", index)
        intro_labels = list(range(index, -1, -1))
        for label in intro_labels:
            line_no = df.at[label, 'line_no']
            audit_df.loc[line_no, 'line_removed'] = 'Yes'
            audit_df.loc[line_no, 'introduction'] = 'Yes'
        df.drop(index=intro_labels, inplace=True)

        print("title and introduction removed")
        break
|
||
|
||
|
||
def remove_page_numbers(df, audit_df):
    """Flag right-aligned page-number lines in ``audit_df``.

    A line counts as a page number when it starts with more than 54 spaces
    and the rest consists only of digits and dots. For such lines
    'line_removed' and 'page_no' are set to 'Yes' in ``audit_df`` (keyed by
    ``line_no``); ``df`` itself is not modified.

    The decision is made per line: the "found" flag is no longer a variable
    initialised once outside the loop, and audit flags are written with
    ``.loc`` instead of chained assignment.
    """
    page_no_pattern = re.compile(r'[\d.]+')

    for index in df.index:
        data = df.at[index, 'data']
        if check_space(data) <= 54:
            continue
        if not page_no_pattern.fullmatch(data.strip()):
            continue

        line_no = df.at[index, 'line_no']
        audit_df.loc[line_no, 'line_removed'] = 'Yes'
        audit_df.loc[line_no, 'page_no'] = 'Yes'
|
||
|
||
|
||
|
||
def get_per_uppercase(text):
    """Return the integer percentage of upper-case characters in ``text``.

    The percentage is taken over the stripped text (every character counts
    in the denominator, not only letters) and truncated to an int. Empty or
    whitespace-only text gives 0.
    """
    trimmed = text.strip()
    upper_total = sum(1 for character in trimmed if character.isupper())
    if not trimmed:
        return 0
    return int(upper_total / len(trimmed) * 100)
|
||
def _classify_parenthetical(text):
    """Describe where parentheses occur in ``text`` (after stripping)."""
    stripped = text.strip()
    has_open = '(' in stripped
    has_close = ')' in stripped

    if stripped.startswith('('):
        if stripped.endswith(')'):
            return 'Complete'
        return 'PartStartMid' if has_close else 'StartingLeft'
    if stripped.endswith(')'):
        return 'PartMidEnd' if has_open else 'EndingRight'
    # Start and end are already handled, so any parenthesis left is mid-line.
    if has_open and has_close:
        return 'PartMidMid'
    if has_open:
        return 'MixedLeft'
    if has_close:
        return 'MixedRight'
    return 'Absent'


def prep_for_audit(df):
    """Compute the per-line layout features used by the audit rules.

    ``df`` is re-indexed to 0..n-1 in place and gains or refreshes these
    columns:

    * Identification_Status - set to 'blank' for blank lines
    * plb / nlb - "Y"/"N": is the previous / next line blank
    * ssc - number of leading spaces
    * lcp - index of the last non-whitespace character
    * case, per_uppercase, parenthetical - text classifications
    * pnbl_line_no / nnbl_line_no - ``line_no`` of the neighbouring non-blank
      line, looking at most two lines away (0 / 100000 when there is none)
    * ppnbl_line_no / nnnbl_line_no - the same lookup applied to that line
    * pdil_line_no / ndil_line_no - ``line_no`` of the nearest non-blank line
      before / after with a different indent (0 / 100000 when there is none)

    A line is blank when it is empty, whitespace-only, or starts with at
    least 140 spaces. Returns ``df``.

    Cells are written with ``.at``/whole-column assignment instead of chained
    ``df[col][index] = ...`` assignment, which does not reliably write back
    to the frame. Per-line debug output was removed.
    """
    no_previous = 0
    no_next = 100000

    def is_blank(text):
        return check_space(text) >= 140 or text.isspace() or not text

    df.reset_index(inplace=True, drop=True)
    print("Entering prep_for_audit")

    df['data'] = df['data'].fillna('')

    column_defaults = {
        'scene_number': '',
        'Identification_Status': '',
        'plb': '',
        'nlb': '',
        'ssc': '',
        'lcp': 0,
        'case': '',
        'per_uppercase': '',
        'parenthetical': '',
        'pnbl_line_no': '',
        'nnbl_line_no': '',
        'ppnbl_line_no': '',
        'nnnbl_line_no': '',
        'pdil_line_no': '',
        'ndil_line_no': '',
    }
    for column, default in column_defaults.items():
        if column not in df.columns:
            df[column] = default

    if df.empty:
        return df
    last_index = df.index[-1]

    # Pass 1: features of the line itself and its direct neighbours.
    lcp_values = []
    for index in df.index:
        data = df.at[index, 'data']
        if is_blank(data):
            df.at[index, 'Identification_Status'] = 'blank'

        first_line = index == 0
        last_line = index == last_index

        plb = "Y" if not first_line and is_blank(df.at[index - 1, 'data']) else "N"
        nlb = "Y" if not last_line and is_blank(df.at[index + 1, 'data']) else "N"

        df.at[index, 'plb'] = plb
        df.at[index, 'nlb'] = nlb
        df.at[index, 'ssc'] = check_space(data)
        df.at[index, 'case'] = get_case(data)
        df.at[index, 'parenthetical'] = _classify_parenthetical(data)
        df.at[index, 'per_uppercase'] = get_per_uppercase(data)
        lcp_values.append(get_last_char_pos(data))

        if first_line:
            pnbl_line_no = no_previous
        elif plb == 'N':
            pnbl_line_no = df.at[index - 1, 'line_no']
        elif index - 1 == 0:
            pnbl_line_no = no_previous
        else:
            pnbl_line_no = df.at[index - 2, 'line_no']

        if last_line:
            nnbl_line_no = no_next
        elif nlb == 'N':
            nnbl_line_no = df.at[index + 1, 'line_no']
        elif index + 1 == last_index:
            nnbl_line_no = no_next
        else:
            nnbl_line_no = df.at[index + 2, 'line_no']

        df.at[index, 'pnbl_line_no'] = pnbl_line_no
        df.at[index, 'nnbl_line_no'] = nnbl_line_no

    # Empty lines have no last character (None); assigning the whole column
    # lets pandas choose a dtype that can hold the missing values.
    df['lcp'] = lcp_values
    print("prep_for_audit- after 1st for loop")

    # Pass 2: neighbours of the neighbouring non-blank lines.
    for index in df.index:
        pnbl_line_no = df.at[index, 'pnbl_line_no']
        if pnbl_line_no == no_previous:
            ppnbl_line_no = no_previous
        else:
            ppnbl_line_no = df.loc[df['line_no'] == pnbl_line_no, 'pnbl_line_no'].values[0]

        nnbl_line_no = df.at[index, 'nnbl_line_no']
        if nnbl_line_no == no_next:
            nnnbl_line_no = no_next
        else:
            nnnbl_line_no = df.loc[df['line_no'] == nnbl_line_no, 'nnbl_line_no'].values[0]

        df.at[index, 'ppnbl_line_no'] = ppnbl_line_no
        df.at[index, 'nnnbl_line_no'] = nnnbl_line_no
    print("prep_for_audit- after 2nd for loop")

    # Pass 3: nearest non-blank line with a different indent, both directions.
    for index in df.index:
        cur_indent = df.at[index, 'ssc']

        pdil_line_no = no_previous
        for other in range(index - 1, -1, -1):
            if df.at[other, 'Identification_Status'] != 'blank' and df.at[other, 'ssc'] != cur_indent:
                pdil_line_no = df.at[other, 'line_no']
                break
        df.at[index, 'pdil_line_no'] = pdil_line_no

        ndil_line_no = no_next
        for other in range(index + 1, last_index + 1):
            if df.at[other, 'Identification_Status'] != 'blank' and df.at[other, 'ssc'] != cur_indent:
                ndil_line_no = df.at[other, 'line_no']
                break
        df.at[index, 'ndil_line_no'] = ndil_line_no
    print("prep_for_audit- after 3rd for loop")

    return df
|
||
|
||
def remove_extra_blank_lines(df, audit_df):
    """Flag all but the last blank line of every run of blank lines.

    For each line except the last one:

    * blank line followed by a blank line - ``audit_df`` gets 'line_removed'
      = 'Yes' and 'Identification_Status' = 'blank' for that ``line_no``;
    * blank line followed by a non-blank line - ``df['plb']`` of that line is
      set to 'N'.

    Assumes ``df`` has the default 0..n-1 index. An empty ``df`` is a no-op.
    Cells are written with ``.at``/``.loc`` instead of chained assignment.
    """
    if df.empty:
        return

    for index in range(0, df.index[-1]):
        if df.at[index, 'data'].strip():
            continue

        if df.at[index + 1, 'data'].strip():
            df.at[index, 'plb'] = 'N'
        else:
            line_no = df.at[index, 'line_no']
            audit_df.loc[line_no, 'line_removed'] = 'Yes'
            audit_df.loc[line_no, 'Identification_Status'] = 'blank'
|
||
|
||
def remove_blank_line_after_parenthetical(df, audit_df):
    """Flag the blank line that directly follows a closed parenthetical.

    When a line's 'parenthetical' value is 'Complete' or 'EndingRight' and the
    next line is blank, the next line is marked 'line_removed' = 'Yes' in
    ``audit_df`` (keyed by its ``line_no``) and ``df['nlb']`` of the
    parenthetical line is set to 'N'.

    Assumes ``df`` has the default 0..n-1 index. An empty ``df`` is a no-op.
    Cells are written with ``.at``/``.loc`` instead of chained assignment.
    """
    if df.empty:
        return

    for index in range(0, df.index[-1]):
        if df.at[index, 'parenthetical'] not in ('Complete', 'EndingRight'):
            continue
        if df.at[index + 1, 'data'].strip():
            continue

        nl_line_no = df.at[index + 1, 'line_no']
        audit_df.loc[nl_line_no, 'line_removed'] = 'Yes'
        df.at[index, 'nlb'] = 'N'
|
||
|
||
|
||
|
||
def merge_broken_lines(df, audit_df):
    """Merge a line with the next non-blank line when it looks hard-wrapped.

    A merge candidate is a line whose next non-blank line starts 1 to 3
    columns after this line's last character, where neither line is
    'AllUpper' and the merged text is shorter than 150 characters. On a merge
    the text and 'case' of the current row are updated, ``audit_df`` gets
    'line_merged_with_next_line' = 'Yes' for the current line and
    'line_removed' = 'Yes' for the merged-in line, and the merged-in row (plus
    an intervening blank row) is skipped.

    NOTE(review): ``cur_case`` is hard-coded to 'AllUpper', so the merge
    condition can never be true and this function currently changes nothing.
    That is kept exactly as found - confirm whether merging was disabled on
    purpose before switching it to ``df['case']``.

    Latent problems in the merge path were fixed without changing today's
    behaviour: ``next_nbl_case`` is always bound, skipping rows can no longer
    leak ``StopIteration`` at the end of the frame, cells are written with
    ``.at``/``.loc`` instead of chained assignment, and an empty ``df`` is a
    no-op.
    """
    if df.empty:
        return

    index_iter = iter(range(0, df.index[-1]))
    for index in index_iter:
        cur_line_data = df.at[index, 'data']
        cur_case = 'AllUpper'
        cur_lcp = df.at[index, 'lcp']
        nnbl_line_no = df.at[index, 'nnbl_line_no']
        nlb = df.at[index, 'nlb']
        line_no = df.at[index, 'line_no']

        next_rows = df.loc[df['line_no'] == nnbl_line_no, ['data', 'ssc', 'case']]
        if len(next_rows):
            next_nbl_data = next_rows['data'].values[0]
            next_nbl_indent = next_rows['ssc'].values[0]
            next_nbl_case = next_rows['case'].values[0]
        else:
            # No next non-blank line (for example the 100000 sentinel).
            next_nbl_data = ''
            next_nbl_indent = 0
            next_nbl_case = ''

        indent_dif = next_nbl_indent - cur_lcp
        mergeable = (
            indent_dif > 0
            and indent_dif <= 3
            and next_nbl_case != 'AllUpper'
            and cur_case != 'AllUpper'
        )
        if not mergeable:
            continue

        # A gap of exactly one column means the word itself was split.
        joiner = '' if indent_dif == 1 else ' '
        two_line_data = cur_line_data.rstrip() + joiner + next_nbl_data.lstrip()
        if len(two_line_data.strip()) >= 150:
            continue

        print("merging lines")
        df.at[index, 'data'] = two_line_data
        df.at[index, 'case'] = get_case(two_line_data)
        audit_df.loc[line_no, 'line_merged_with_next_line'] = 'Yes'
        audit_df.loc[nnbl_line_no, 'line_removed'] = 'Yes'

        # Skip the merged-in row, and the blank row in between if there is one.
        next(index_iter, None)
        if nlb != 'N':
            next(index_iter, None)
|
||
|
||
def remove_space_between_words(df, audit_df):
    """Collapse runs of whitespace between the words of every kept line.

    Lines already flagged 'line_removed' in ``audit_df`` and lines indented by
    more than 140 columns are skipped. For every other line the words are
    re-joined with single spaces (each word is followed by one space, so the
    text keeps a trailing space), the original indent from ``df['ssc']`` is
    re-applied, and ``df['lcp']`` is recomputed. When the visible text changed,
    ``audit_df['space_removed_between_characters']`` is set to 'Yes'.

    Removed line numbers are kept in a set for constant-time lookups, and
    cells are written with ``.at``/``.loc`` instead of chained assignment.
    """
    lines_removed = set(audit_df.loc[audit_df['line_removed'] == 'Yes'].index)

    for index in df.index:
        cur_indent = df.at[index, 'ssc']
        line_no = df.at[index, 'line_no']
        if line_no in lines_removed or cur_indent > 140:
            continue

        data = df.at[index, 'data']
        new_data = ''.join(word + " " for word in data.split())
        new_data = new_data.rjust(len(new_data) + cur_indent)

        df.at[index, 'data'] = new_data
        df.at[index, 'lcp'] = get_last_char_pos(new_data)
        if new_data.strip() != data.strip():
            audit_df.loc[line_no, 'space_removed_between_characters'] = 'Yes'
|
||
|
||
|
||
#df = df.loc[df['line_removed'] != 'Yes',:]
|
||
|
||
|
||
def get_strict_conditions(csv_strict_conditions):
    """Load the strict-condition matrix from a CSV file.

    The first line of the file is skipped, the first column becomes the index
    and only the first 30 rows are kept. Columns are picked by position,
    renamed to their rule names and returned in a fixed order (current line,
    previous/next non-blank line, previous/next different-indent line).
    """
    import pandas as pd

    raw_conditions = pd.read_csv(
        csv_strict_conditions, index_col=[0], skiprows=[0]
    ).head(30)

    # Position of each rule column in the sheet -> name used by the audit.
    names_by_position = {
        3: 'cl_plb',
        4: 'cl_nlb',
        5: 'cl_ssc',
        6: 'cl_lcp',
        7: 'cl_par',
        8: 'cl_case',
        9: 'cl_per_uppercase',
        10: 'pnbl_plb',
        11: 'pnbl_par',
        12: 'pnbl_vs_cur_indent',
        15: 'pnbl_case',
        16: 'nnbl_nlb',
        17: 'nnbl_par',
        18: 'nnbl_vs_cur_indent',
        21: 'nnbl_case',
        22: 'pdil_plb',
        23: 'pdil_nlb',
        24: 'pdil_vs_cur_indent',
        27: 'pdil_par',
        29: 'ndil_plb',
        30: 'ndil_nlb',
        31: 'ndil_vs_cur_indent',
        34: 'ndil_par',
    }
    sheet_columns = raw_conditions.columns
    renamed = raw_conditions.rename(
        columns={sheet_columns[pos]: name for pos, name in names_by_position.items()}
    )

    ordered_columns = [
        'cl_plb', 'cl_nlb', 'cl_ssc', 'cl_lcp', 'cl_par', 'cl_case', 'cl_per_uppercase',
        'pnbl_plb', 'pnbl_par', 'pnbl_vs_cur_indent', 'pnbl_case',
        'nnbl_nlb', 'nnbl_par', 'nnbl_vs_cur_indent', 'nnbl_case',
        'pdil_plb', 'pdil_nlb', 'pdil_par', 'pdil_vs_cur_indent',
        'ndil_plb', 'ndil_nlb', 'ndil_par', 'ndil_vs_cur_indent',
    ]
    return renamed[ordered_columns]
|
||
|
||
def _strict_indent_relation(other_indent, cur_indent):
    """Describe another line's indent relative to the current line's indent."""
    if other_indent > cur_indent:
        return "More"
    if other_indent == cur_indent:
        return "Same"
    return "Less"


def _strict_parse_range(cell):
    """Parse a ``'from-to'`` condition cell into ``(lower, upper)`` bounds.

    An unparseable lower bound becomes 200; a missing or unparseable upper
    bound falls back to the lower bound.
    """
    parts = cell.split('-')
    try:
        lower = pd.to_numeric(parts[0])
    except (ValueError, TypeError):
        lower = 200
    try:
        upper = pd.to_numeric(parts[1])
    except (IndexError, ValueError, TypeError):
        upper = lower
    return lower, upper


def _strict_lookup_row(df, line_no_column, index):
    """Return the label of the row referenced by ``df[line_no_column][index]``, or None."""
    try:
        target_line_no = df[line_no_column][index]
        return df.loc[df['line_no'] == target_line_no, :].index.values[0]
    except (KeyError, IndexError):
        return None


def _strict_flag_matches(actual, expected):
    """A Y/N style flag matches when equal to the expectation or the expectation is 'Maybe'."""
    return (actual == expected) or (expected == 'Maybe')


def _strict_parse_condition(conditions_df, key):
    """Read one row (``key``, e.g. ``'ps3'``) of the condition matrix into a dict."""
    def cell(column):
        return conditions_df[column][key]

    def options(column):
        # Multi-valued cells list their alternatives separated by ';'.
        return cell(column).split(';')

    uppercase_rule = options('cl_per_uppercase')
    try:
        operator = uppercase_rule[0]
        value = int(uppercase_rule[1])
    except (IndexError, ValueError):
        operator = ""
        value = ""

    return {
        'cl_plb': cell('cl_plb'),
        'cl_nlb': cell('cl_nlb'),
        'cl_indent': _strict_parse_range(cell('cl_ssc')),
        'cl_lcp': _strict_parse_range(cell('cl_lcp')),
        'cl_par': options('cl_par'),
        'cl_case': options('cl_case'),
        'operator': operator,
        'value': value,
        'pnbl_plb': cell('pnbl_plb'),
        'pnbl_vs_cur_indent': options('pnbl_vs_cur_indent'),
        'pnbl_par': options('pnbl_par'),
        'pnbl_case': options('pnbl_case'),
        'nnbl_nlb': cell('nnbl_nlb'),
        'nnbl_vs_cur_indent': options('nnbl_vs_cur_indent'),
        'nnbl_par': options('nnbl_par'),
        'nnbl_case': options('nnbl_case'),
        'pdil_plb': cell('pdil_plb'),
        'pdil_nlb': cell('pdil_nlb'),
        'pdil_vs_cur_indent': options('pdil_vs_cur_indent'),
        'pdil_par': options('pdil_par'),
        'ndil_plb': cell('ndil_plb'),
        'ndil_nlb': cell('ndil_nlb'),
        'ndil_vs_cur_indent': options('ndil_vs_cur_indent'),
        'ndil_par': options('ndil_par'),
    }


def test_strict_conditions(df, csv_strict_conditions):
    """Identify lines that satisfy one of the strict conditions ps1..ps17.

    If no line has ``ssc`` above 15 the script is treated as left aligned:
    strict conditions are skipped, rows whose ``Identification_Status`` is
    ``'blank'`` get ``isIdentified = 'Yes'`` and the function returns.

    Otherwise every row not yet identified is compared, in order, against the
    conditions loaded by ``get_strict_conditions``. A condition matches when
    all checks on the current line (``cl_*``), the previous / next non-blank
    line (``pnbl_*`` / ``nnbl_*``) and the lines referenced by
    ``pdil_line_no`` / ``ndil_line_no`` pass. The first match writes
    ``Identification_Status``, ``isIdentified = 'Yes'`` and
    ``When_Identified = 'FirstStrictConditions'`` for that row.

    ``df`` is modified in place; nothing is returned.

    Args:
        df: line frame (columns used: ``ssc``, ``plb``, ``nlb``, ``lcp``,
            ``parenthetical``, ``case``, ``per_uppercase``, ``line_no``,
            ``pnbl_line_no``, ``nnbl_line_no``, ``pdil_line_no``,
            ``ndil_line_no``, ``isIdentified``, ``Identification_Status``).
        csv_strict_conditions: CSV passed on to ``get_strict_conditions``.
    """
    # NOTE(review): block nesting was reconstructed from a copy of this file
    # whose indentation had been stripped -- confirm against version control.
    left_aligned = not (df['ssc'] > 15).any()

    # If the text conversion is left aligned, dialogue middles get wrongly
    # identified as action middles, so strict conditions are skipped.
    if left_aligned:
        df.loc[df['Identification_Status'] == 'blank', 'isIdentified'] = 'Yes'
        return

    conditions_df = get_strict_conditions(csv_strict_conditions)
    df['When_Identified'] = ''
    parsed_conditions = {}  # condition key -> parsed row, filled on first use

    for index in df.index:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue

        cl_plb = df.at[index, 'plb']
        cl_nlb = df.at[index, 'nlb']
        cl_indent = pd.to_numeric(df.at[index, 'ssc'])
        cl_lcp = df.at[index, 'lcp']
        cl_par = df.at[index, 'parenthetical']
        cl_case = str(df.at[index, 'case'])
        cl_per_uppercase = df.at[index, 'per_uppercase']

        # Previous non-blank line.
        pnbl_plb = pnbl_par = pnbl_case = None
        pnbl_vs_cur_indent = "NA"
        pnbl_index = _strict_lookup_row(df, 'pnbl_line_no', index)
        pnbl = pnbl_index is not None
        if pnbl:
            pnbl_plb = df.at[pnbl_index, 'plb']
            pnbl_par = df.at[pnbl_index, 'parenthetical']
            pnbl_case = str(df.at[pnbl_index, 'case'])
            pnbl_vs_cur_indent = _strict_indent_relation(df.at[pnbl_index, 'ssc'], cl_indent)

        # Next non-blank line. Unlike the other neighbours, a missing next
        # line does NOT auto-pass: the original set the flag to the truthy
        # string 'afterlast', so its checks always ran against None / "NA".
        # NOTE(review): confirm this asymmetry is intended.
        nnbl_nlb = nnbl_par = nnbl_case = None
        nnbl_vs_cur_indent = "NA"
        try:
            nnbl_line_no = df['nnbl_line_no'][index]
            nnbl_index = df.loc[df['line_no'] == nnbl_line_no, :].index.values[0]
            nnbl_nlb = df.at[nnbl_index, 'nlb']
            nnbl_indent = df.at[nnbl_index, 'ssc']
            nnbl_par = df.at[nnbl_index, 'parenthetical']
            nnbl_case = str(df.at[nnbl_index, 'case'])
            nnbl_vs_cur_indent = _strict_indent_relation(nnbl_indent, cl_indent)
        except Exception:
            # Best effort, as before: keep whatever was read so far.
            pass

        # Line referenced by pdil_line_no.
        pdil_plb = pdil_nlb = pdil_par = None
        pdil_vs_cur_indent = "NA"
        pdil_index = _strict_lookup_row(df, 'pdil_line_no', index)
        pdil = pdil_index is not None
        if pdil:
            pdil_plb = df.at[pdil_index, 'plb']
            pdil_nlb = df.at[pdil_index, 'nlb']
            pdil_par = df.at[pdil_index, 'parenthetical']
            pdil_vs_cur_indent = _strict_indent_relation(df.at[pdil_index, 'ssc'], cl_indent)

        # Line referenced by ndil_line_no.
        ndil_plb = ndil_nlb = ndil_par = None
        ndil_vs_cur_indent = "NA"
        ndil_index = _strict_lookup_row(df, 'ndil_line_no', index)
        ndil = ndil_index is not None
        if ndil:
            ndil_plb = df.at[ndil_index, 'plb']
            ndil_nlb = df.at[ndil_index, 'nlb']
            ndil_par = df.at[ndil_index, 'parenthetical']
            ndil_vs_cur_indent = _strict_indent_relation(df.at[ndil_index, 'ssc'], cl_indent)

        for j in range(1, 18):
            # Unreachable while the range stops at 17; kept for when the
            # range is widened again.
            if j in (23, 24, 32, 33):
                continue

            key = "ps{0}".format(j)
            cond = parsed_conditions.get(key)
            if cond is None:
                cond = parsed_conditions[key] = _strict_parse_condition(conditions_df, key)

            if not pnbl:
                pnbl_checks = [True, True, True, True]
            else:
                pnbl_checks = [
                    _strict_flag_matches(pnbl_plb, cond['pnbl_plb']),
                    pnbl_vs_cur_indent in cond['pnbl_vs_cur_indent'],
                    pnbl_par in cond['pnbl_par'],
                    # An empty cell splits to [''] and means "any case". The
                    # old comparison against the string '' could never be true.
                    pnbl_case in cond['pnbl_case'] or cond['pnbl_case'] == [''],
                ]

            nnbl_checks = [
                _strict_flag_matches(nnbl_nlb, cond['nnbl_nlb']),
                nnbl_vs_cur_indent in cond['nnbl_vs_cur_indent'],
                nnbl_par in cond['nnbl_par'],
                nnbl_case in cond['nnbl_case'] or cond['nnbl_case'] == [''],
            ]

            if not pdil:
                pdil_checks = [True, True, True, True]
            else:
                pdil_checks = [
                    _strict_flag_matches(pdil_plb, cond['pdil_plb']),
                    _strict_flag_matches(pdil_nlb, cond['pdil_nlb']),
                    pdil_vs_cur_indent in cond['pdil_vs_cur_indent'],
                    pdil_par in cond['pdil_par'],
                ]

            if not ndil:
                ndil_checks = [True, True, True, True]
            else:
                ndil_checks = [
                    _strict_flag_matches(ndil_plb, cond['ndil_plb']),
                    _strict_flag_matches(ndil_nlb, cond['ndil_nlb']),
                    ndil_vs_cur_indent in cond['ndil_vs_cur_indent'],
                    ndil_par in cond['ndil_par'],
                ]

            indent_from, indent_to = cond['cl_indent']
            lcp_from, lcp_to = cond['cl_lcp']

            cl_plb_check = _strict_flag_matches(cl_plb, cond['cl_plb'])
            cl_nlb_check = _strict_flag_matches(cl_nlb, cond['cl_nlb'])
            cl_indent_check = (cl_indent >= indent_from) and (cl_indent <= indent_to)
            cl_lcp_check = (cl_lcp >= lcp_from) and (cl_lcp <= lcp_to)
            cl_par_check = cl_par in cond['cl_par']

            if j == 21 and cl_case in cond['cl_case']:
                # ps21 additionally requires the last word to be upper case.
                last_word = df.at[index, 'data'].split()[-1]
                cl_case_check = last_word == last_word.upper()
            else:
                cl_case_check = cl_case in cond['cl_case']

            # Percentage-of-uppercase rule: "<operator>;<value>".
            operator = cond['operator']
            value = cond['value']
            cl_per_uppercase_check = True
            if operator == "lessthan":
                cl_per_uppercase_check = cl_per_uppercase <= value
            elif operator == "morethan":
                cl_per_uppercase_check = cl_per_uppercase >= value
            elif operator == "equal":
                # Previously assigned to a misspelled name, so "equal" was
                # silently ignored.
                cl_per_uppercase_check = cl_per_uppercase == value

            checklist = [
                cl_plb_check, cl_nlb_check, cl_indent_check, cl_lcp_check,
                cl_par_check, cl_case_check, cl_per_uppercase_check,
            ] + pnbl_checks + nnbl_checks + pdil_checks + ndil_checks

            if all(checklist):
                df.at[index, 'Identification_Status'] = key
                df.at[index, 'isIdentified'] = 'Yes'
                df.at[index, 'When_Identified'] = 'FirstStrictConditions'
                break

    df.loc[df['Identification_Status'] == 'blank', 'isIdentified'] = 'Yes'


# The name starts with "test_", so pytest would try to collect this production
# function whenever it is imported into a test module; opt it out.
test_strict_conditions.__test__ = False
def _interpolated_weights(wts_df, prefix, anchor_positions, fill_positions):
    """Linearly interpolate per-position weights between the anchor columns.

    Builds a frame indexed by ``Possibilities`` holding the anchor columns
    ``<prefix>_<pos>``, adds an empty column for every position in
    ``fill_positions`` that is not already present, interpolates across the
    columns and rounds. ``Possibilities`` is returned as a regular column.
    """
    anchor_cols = ['{}_{}'.format(prefix, pos) for pos in anchor_positions]
    sub = wts_df.loc[:, ['Possibilities'] + anchor_cols].set_index('Possibilities')

    for pos in fill_positions:
        name = '{}_{}'.format(prefix, pos)
        # Never blank out an anchor: it carries a real weight from the CSV.
        if name not in sub.columns:
            sub[name] = np.nan

    # Column names share a prefix and have two-digit positions, so sorting
    # the labels also sorts them by position.
    return sub.sort_index(axis=1).interpolate(axis=1).apply(round).reset_index()


def prep_weights_csv(weights_csv):
    """Load the weights matrix and expand it to one column per position.

    The CSV is read with its first row skipped and truncated to 50 rows.
    Feature columns are renamed by position, the frame is reduced to the
    columns used for scoring, and two families of columns are filled in by
    linear interpolation (then rounded):

    * ``ssc_16`` .. ``ssc_64`` from the anchors ``ssc_15, 25, 30, 35, 55, 65``
    * ``lcp_31`` .. ``lcp_74`` from the anchors ``lcp_35, 49, 59, 72`` plus
      ``lcp_30`` and ``lcp_75``, which are both fixed at 1.

    Anchor columns also reappear with a ``_y`` suffix as a by-product of the
    merges.

    Args:
        weights_csv: path or file-like object accepted by ``pandas.read_csv``.
            The ``ssc_*`` / ``lcp_*`` anchor columns must carry those names in
            the header.

    Returns:
        DataFrame indexed by ``Possibilities``.
    """
    # position in the CSV -> feature column name
    positional_names = {
        1: 'Possibilities', 2: 'Description', 3: 'PureImpure',
        7: 'AllUpper', 8: 'AllLower', 9: 'FirstCamel', 10: 'FirstUpper',
        11: 'FirstLowerSomeUpper', 12: 'Partial', 13: 'EntireLine', 14: 'PartofLine',
        15: 'only left parenthetical present', 16: 'only right parenthetical present',
        17: 'PLB_Yes', 18: 'PLB_No', 19: 'NLB_Yes', 20: 'NLB_No',
        21: '<15withNumeric', 29: '<15withoutNumeric',
        34: 'cur_indent_equals_pnbl', 35: 'cur_indent_equals_nnbl',
        36: 'containsSpecialWords1', 37: 'containsSpecialWords2',
        38: 'containsSpecialWords3', 39: 'containsSpecialWords4',
    }
    kept_columns = [
        'Possibilities', 'Description', 'PureImpure', 'AllUpper', 'AllLower',
        'FirstCamel', 'FirstUpper', 'FirstLowerSomeUpper',
        'Partial', 'EntireLine', 'PartofLine',
        'only left parenthetical present', 'only right parenthetical present',
        'PLB_Yes', 'PLB_No', 'NLB_Yes', 'NLB_No', '<15withNumeric',
        'ssc_15', 'ssc_25', 'ssc_30', 'ssc_35', 'ssc_55', 'ssc_65', 'ssc_gt_65',
        '<15withoutNumeric', 'lcp_35', 'lcp_49', 'lcp_59', 'lcp_72',
        'cur_indent_equals_pnbl', 'cur_indent_equals_nnbl',
        'containsSpecialWords1', 'containsSpecialWords2',
        'containsSpecialWords3', 'containsSpecialWords4',
    ]

    wts_df = pd.read_csv(weights_csv, skiprows=[0]).head(50)
    cols = wts_df.columns
    wts_df = wts_df.rename(
        columns={cols[position]: name for position, name in positional_names.items()}
    )
    wts_df = wts_df.loc[:, kept_columns]

    # Interpolate the in-between weights for the starting space count. The
    # ssc_55 anchor used to be overwritten with NaN here (55 was missing from
    # the skip list), so ssc_36..ssc_64 ignored it.
    ssc_weights = _interpolated_weights(wts_df, 'ssc', (15, 25, 30, 35, 55, 65), range(16, 65))
    wts_df = wts_df.merge(ssc_weights, how='inner', on=['Possibilities'], suffixes=('', '_y'))

    # Interpolate the in-between weights for the last character placement.
    wts_df['lcp_30'] = 1
    wts_df['lcp_75'] = 1
    lcp_weights = _interpolated_weights(wts_df, 'lcp', (30, 35, 49, 59, 72, 75), range(31, 75))
    wts_df = wts_df.merge(lcp_weights, how='inner', on=['Possibilities'], suffixes=('', '_y'))

    wts_df.set_index('Possibilities', inplace=True)
    return wts_df
def give_largest(df, n):
    """Return the ``n`` largest values of a Series under rank labels.

    Args:
        df: a pandas Series (one row or column handed over by
            ``DataFrame.apply``), despite the parameter name.
        n: how many of the largest values to keep.

    Returns:
        Series of the values in descending order, labelled ``'1_largest'``,
        ``'2_largest'``, ... It is shorter than ``n`` when the input has
        fewer values.
    """
    top_values = df.nlargest(n)
    rank_labels = [f'{rank}_largest' for rank in range(1, len(top_values) + 1)]
    return pd.Series(list(top_values), index=rank_labels)
def n_largest(df, axis, n):
    """Return the n largest values of each column/row of the input DataFrame.

    Args:
        df: DataFrame of numeric values.
        axis: axis passed to ``DataFrame.apply`` (0 applies per column,
            1 applies per row).
        n: how many of the largest values to keep per column/row.

    Returns:
        The result of applying ``give_largest`` along ``axis``; the values
        are labelled ``'1_largest'`` .. ``'<n>_largest'``.
    """
    return df.apply(give_largest, axis=axis, n=n)
def update_parenthetical_neighbor_wt(df):
    """Boost the weights of the lines next to a parenthetical line.

    For a row whose ``parenthetical`` is ``'StartingLeft'`` or ``'Complete'``
    the ``ps7`` weight of the line before it is raised by 10. For
    ``'EndingRight'`` or ``'Complete'`` the ``ps13`` and ``ps15`` weights of
    the line after it are raised by 15. "Before" / "after" is the row label
    one step away, or two steps away when ``plb`` / ``nlb`` is not ``'N'``.
    A neighbour label that does not exist is skipped silently.

    Args:
        df: line frame with ``parenthetical``, ``plb``, ``nlb``, ``ps7``,
            ``ps13`` and ``ps15`` columns. The neighbour arithmetic assumes
            integer row labels.

    Returns:
        The same ``df`` object, updated in place.
    """
    # NOTE(review): block nesting was reconstructed from a copy of this file
    # whose indentation had been stripped -- confirm against version control.
    print("updating weghts of parenthetical neighbors")

    for index in df.index:
        par = df.at[index, 'parenthetical']
        if par == 'Absent':
            continue

        # Line before a StartingLeft or Complete parenthetical.
        if par in ('StartingLeft', 'Complete'):
            print(index, par)
            try:
                step = 1 if df.at[index, 'plb'] == 'N' else 2
                # .at updates the frame itself; the chained form
                # df["ps7"][label] += 10 may update a temporary copy.
                df.at[index - step, 'ps7'] += 10
            except (KeyError, IndexError, TypeError):
                # No such neighbour (start of script) or non-numeric label.
                pass

        # Line after a Complete or EndingRight parenthetical.
        if par in ('EndingRight', 'Complete'):
            print(index, par)
            try:
                step = 1 if df.at[index, 'nlb'] == 'N' else 2
                df.at[index + step, 'ps13'] += 15
                df.at[index + step, 'ps15'] += 15
            except (KeyError, IndexError, TypeError):
                # No such neighbour (end of script) or non-numeric label.
                pass

    return df
def _pos_neighbor_indent(df, line_no_column, index):
    """Return ``ssc`` of the row referenced by ``df[line_no_column][index]``.

    Returns -1 when the reference column or the referenced row is missing.
    """
    try:
        neighbor_line_no = df[line_no_column][index]
        neighbor_index = df.loc[df['line_no'] == neighbor_line_no, :].index.values[0]
        return df['ssc'][neighbor_index]
    except (KeyError, IndexError):
        return -1


def gen_pos_weights(df, weights_csv):
    """Score every line against every possibility ``ps1`` .. ``ps31``.

    For each row of ``df`` and each possibility, the weight is the sum of the
    matching entries of the weights matrix (``prep_weights_csv``) for:
    case, starting space count, "<15 with/without leading digit",
    ``ssc`` above 65, last character placement, PLB/NLB flags, parenthesis
    shape and whether the indent equals that of the previous / next
    non-blank line. Special-word bonuses (transitions or fully quoted lines,
    slug prefixes, slug endings), a speaker boost for left-aligned scripts
    and any ``preassigned_weights`` are then added, and finally
    ``update_parenthetical_neighbor_wt`` adjusts the neighbours of
    parenthetical lines.

    Args:
        df: line frame (columns used: ``ssc``, ``line_no``, ``data``,
            ``plb``, ``nlb``, ``case``, ``lcp``, ``pnbl_line_no``,
            ``nnbl_line_no``, ``preassigned_weights``, ``parenthetical``).
        weights_csv: CSV passed on to ``prep_weights_csv``.

    Returns:
        ``df`` with ``ps1`` .. ``ps31`` columns filled in and an
        ``actual_element`` column added if it was missing.
    """
    # NOTE(review): block nesting was reconstructed from a copy of this file
    # whose indentation had been stripped -- confirm against version control,
    # in particular that the slug-ending check runs for every line.
    wts_df = prep_weights_csv(weights_csv)

    # NOTE(review): the original tried to skip ids 23/24/32/33 but compared
    # ints against strings, so nothing was ever skipped. All 31 columns are
    # still produced here; confirm the intent before really dropping any.
    possibility_cols = ["ps{0}".format(i) for i in range(1, 32)]

    sp_words1 = ['cut to', 'CUT BACK TO', 'FLASHCUT TO', 'dissolve to', 'intercut', 'Inter Cut', 'PBS', 'interval',
                 'Flashback', 'FADE IN', 'FADE TO BLACK']
    sp_word1_patterns = [re.compile(re.escape(word), re.IGNORECASE) for word in sp_words1]
    sp_words3 = ['INT.', 'EXT.', 'I/E', 'E/I', 'EXT-', 'INT-']
    sp_words4 = [' - MORNING', ' - DAY', ' - EVENING', ' - EVE', ' - NIGHT', ' - LATER', ' - AFTERNOON']

    def add_weight_column(row_label, weight_column):
        # Add one weights-matrix column to every possibility of a row.
        for ps_col in possibility_cols:
            df.at[row_label, ps_col] += wts_df.loc[ps_col, weight_column]

    left_aligned = not (df['ssc'] > 16).any()
    print("is script left aligned: ", left_aligned)

    for ps in possibility_cols:
        df[ps] = 0

    for index in df.index:
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']
        contains_special = False
        print("processing weight for ", line_no)

        # A missing neighbour yields -1 (previously the sentinel False was
        # itself used as a row label).
        pnbl_indent = _pos_neighbor_indent(df, 'pnbl_line_no', index)
        nnbl_indent = _pos_neighbor_indent(df, 'nnbl_line_no', index)

        cur_indent = df.at[index, 'ssc']
        ssc_col = 'ssc_' + str(cur_indent)
        print(ssc_col)

        case = df.at[index, 'case']
        try:
            print("processing line no", line_no, data)
        except Exception:
            # Printing is best-effort (e.g. console encoding errors).
            pass

        print(plb)
        print(nlb)
        print(pnbl_indent)
        print(nnbl_indent)

        lcp = df.at[index, 'lcp']
        try:
            lcp_col = "lcp_" + str(int(lcp))
        except (TypeError, ValueError, OverflowError):
            lcp_col = "lcp_" + str(lcp)

        # ---- row-level facts, computed once instead of per possibility ----
        if case in ('EndUpper', 'MidUpper'):
            case = 'FirstLowerSomeUpper'

        stripped = data.strip()
        starts_with_digit = bool(re.search('[0-9]', stripped[:1]))
        if starts_with_digit and cur_indent < 15:
            low_indent_col = '<15withNumeric'
        elif check_space(data) < 15:
            low_indent_col = '<15withoutNumeric'
        else:
            low_indent_col = None

        plb_col = {'Y': 'PLB_Yes', 'N': 'PLB_No'}.get(plb)
        nlb_col = {'Y': 'NLB_Yes', 'N': 'NLB_No'}.get(nlb)

        has_open = '(' in stripped
        has_close = ')' in stripped
        if stripped[:1] == '(' and stripped[-1:] == ')':
            paren_col = 'EntireLine'
        elif has_open and has_close:
            paren_col = 'PartofLine'
        elif has_open:
            paren_col = 'only left parenthetical present'
        elif has_close:
            paren_col = 'only right parenthetical present'
        else:
            paren_col = None

        # ---- weights from space count, case, parenthesis and plb/nlb ----
        for ps in possibility_cols:
            df.at[index, ps] = 0

            # weight for the case
            if case != 'None':
                df.at[index, ps] += wts_df.loc[ps, case]

            # weight for the starting space count (only ssc_15..ssc_65 exist)
            try:
                df.at[index, ps] += wts_df.loc[ps, ssc_col]
            except KeyError:
                pass

            print(ps, df.at[index, ps])

            # weight for <15 with / without a leading numeric character
            if low_indent_col is not None:
                df.at[index, ps] += wts_df.loc[ps, low_indent_col]

            if cur_indent > 65:
                df.at[index, ps] += wts_df.loc[ps, 'ssc_gt_65']

            print(ps, df.at[index, ps])

            # weight for the last character placement
            try:
                df.at[index, ps] += wts_df.loc[ps, lcp_col]
                print("code was here")
                print(wts_df.loc[ps, lcp_col])
            except Exception as e:
                print("lcp exception is", e)

            # weights for previous / next line blank
            if plb_col is not None:
                df.at[index, ps] += wts_df.loc[ps, plb_col]
            if nlb_col is not None:
                df.at[index, ps] += wts_df.loc[ps, nlb_col]

            # weight for the parenthesis shape
            if paren_col is not None:
                df.at[index, ps] += wts_df.loc[ps, paren_col]

            # weights for indent equal to previous / next non-blank line
            if cur_indent == pnbl_indent:
                df.at[index, ps] += wts_df.loc[ps, 'cur_indent_equals_pnbl']
            if cur_indent == nnbl_indent:
                df.at[index, ps] += wts_df.loc[ps, 'cur_indent_equals_nnbl']

            print(ps, df.at[index, ps])

        print("Special Words Check")
        # ---- special words: the line (colons removed) starts with one ----
        search_data = data.replace(":", "")
        for sp_word, pattern in zip(sp_words1, sp_word1_patterns):
            print(sp_word)
            if pattern.match(search_data.strip()):
                contains_special = True
                break
        print(contains_special, search_data)

        if not contains_special:
            # A line fully wrapped in quotes counts as special too.
            if (len(stripped) > 3
                    and stripped.startswith(('“', '"'))
                    and stripped.endswith(('”', '"'))):
                contains_special = True

        if contains_special:
            try:
                print("found match in ", data)
            except Exception:
                print("found match ")
            add_weight_column(index, 'containsSpecialWords1')

        # ---- special slug words near the start of the line ----
        if not contains_special:
            for sp_word in sp_words3:
                print(sp_word)
                # Literal match: used as a regex, the '.' in 'INT.' / 'EXT.'
                # matched any character (e.g. "INTO").
                if sp_word in stripped[0:8]:
                    contains_special = True
                    try:
                        print("found match in ", data)
                    except Exception:
                        print("found match ")
                    add_weight_column(index, 'containsSpecialWords3')
                    break

        # ---- special slug endings ----
        for sp_word in sp_words4:
            if sp_word in stripped:
                contains_special = True
                try:
                    print("found match in ", data)
                except Exception:
                    print("found match ")
                add_weight_column(index, 'containsSpecialWords4')
                break

        # speaker possible if the line is one or two all-caps words
        if left_aligned and not contains_special:
            if case == 'AllUpper' and len(data.split()) <= 2 and "." not in data and ":" not in data and lcp < 30:
                print("boosting speaker possibility")
                df.at[index, 'ps7'] += 30

        # add preassigned weights, formatted "ps<N>-<weight>;ps<M>-<weight>"
        if not contains_special:
            preassigned = df.at[index, 'preassigned_weights']
            if preassigned:
                for psw in preassigned.split(';'):
                    parts = psw.split('-')
                    df.at[index, parts[0]] += int(parts[1])

    df = update_parenthetical_neighbor_wt(df)

    if 'actual_element' not in df.columns:
        df['actual_element'] = ''

    return df
def sort_pos_decr_wts(df):
|
||
## sort in decreasing order
|
||
|
||
x = n_largest(df[['ps1', 'ps2', 'ps3', 'ps4', 'ps5', 'ps6', 'ps7', 'ps8', 'ps9', 'ps10', 'ps11', 'ps12', 'ps13', 'ps14', 'ps15', 'ps16', 'ps17', 'ps18', 'ps19',
|
||
'ps21', 'ps22', 'ps25', 'ps26', 'ps27','ps28','ps29', 'ps30', 'ps31']], axis=1, n=28)
|
||
|
||
df.insert(8, "1_largest", x['1_largest'])
|
||
df.insert(9, "2_largest", x['2_largest'])
|
||
df.insert(10, "3_largest", x['3_largest'])
|
||
df.insert(11, "4_largest", x['4_largest'])
|
||
df.insert(12, "5_largest", x['5_largest'])
|
||
df.insert(13, "6_largest", x['6_largest'])
|
||
df.insert(14, "7_largest", x['7_largest'])
|
||
df.insert(15, "8_largest", x['8_largest'])
|
||
df.insert(16, "9_largest", x['9_largest'])
|
||
df.insert(17, "10_largest", x['10_largest'])
|
||
df.insert(18, "11_largest", x['11_largest'])
|
||
df.insert(19, "12_largest", x['12_largest'])
|
||
df.insert(20, "13_largest", x['13_largest'])
|
||
df.insert(21, "14_largest", x['14_largest'])
|
||
df.insert(22, "15_largest", x['15_largest'])
|
||
df.insert(23, "16_largest", x['16_largest'])
|
||
df.insert(24, "17_largest", x['17_largest'])
|
||
df.insert(25, "18_largest", x['18_largest'])
|
||
df.insert(26, "19_largest", x['19_largest'])
|
||
df.insert(27, "20_largest", x['20_largest'])
|
||
df.insert(28, "21_largest", x['21_largest'])
|
||
df.insert(29, "22_largest", x['22_largest'])
|
||
df.insert(30, "23_largest", x['23_largest'])
|
||
df.insert(31, "24_largest", x['24_largest'])
|
||
df.insert(32, "25_largest", x['25_largest'])
|
||
df.insert(33, "26_largest", x['26_largest'])
|
||
df.insert(34, "27_largest", x['27_largest'])
|
||
df.insert(35, "28_largest", x['28_largest'])
|
||
|
||
|
||
b = df[['ps1', 'ps2', 'ps3', 'ps4', 'ps5', 'ps6', 'ps7', 'ps8', 'ps9', 'ps10', 'ps11', 'ps12', 'ps13', 'ps14',
|
||
'ps15', 'ps16', 'ps17', 'ps18', 'ps19', 'ps21', 'ps22', 'ps25', 'ps26', 'ps27', 'ps28','ps29', 'ps30', 'ps31']]
|
||
|
||
Tops = pd.DataFrame(b.apply(lambda x: list(b.columns[np.array(x).argsort()[::-1][:28]]), axis=1).to_list(), columns=['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8',
|
||
'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25', 'Top26', 'Top27','Top28'])
|
||
print(Tops)
|
||
res = pd.concat([df, Tops], axis=1)
|
||
|
||
#print("Ye kuch result hai:",res)
|
||
res['first_largest'] = res['Top1'] + "-" + res['1_largest'].astype(str)
|
||
res['second_largest'] = res['Top2'] + "-" + res['2_largest'].astype(str)
|
||
res['third_largest'] = res['Top3'] + "-" + res['3_largest'].astype(str)
|
||
res['fourth_largest'] = res['Top4'] + "-" + res['4_largest'].astype(str)
|
||
res['fifth_largest'] = res['Top5'] + "-" + res['5_largest'].astype(str)
|
||
res['sixth_largest'] = res['Top6'] + "-" + res['6_largest'].astype(str)
|
||
res['seventh_largest'] = res['Top7'] + "-" + res['7_largest'].astype(str)
|
||
res['eight_largest'] = res['Top8'] + "-" + res['8_largest'].astype(str)
|
||
res['ninth_largest'] = res['Top9'] + "-" + res['9_largest'].astype(str)
|
||
res['tenth_largest'] = res['Top10'] + "-" + res['10_largest'].astype(str)
|
||
res['eleventh_largest'] = res['Top11'] + "-" + res['11_largest'].astype(str)
|
||
res['twelth_largest'] = res['Top12'] + "-" + res['12_largest'].astype(str)
|
||
res['thirteenth_largest'] = res['Top13'] + "-" + res['13_largest'].astype(str)
|
||
res['fourteenth_largest'] = res['Top14'] + "-" + res['14_largest'].astype(str)
|
||
res['fifteenth_largest'] = res['Top15'] + "-" + res['15_largest'].astype(str)
|
||
res['sixteenth_largest'] = res['Top16'] + "-" + res['16_largest'].astype(str)
|
||
res['seventeenth_largest'] = res['Top17'] + "-" + res['17_largest'].astype(str)
|
||
res['eighteenth_largest'] = res['Top18'] + "-" + res['18_largest'].astype(str)
|
||
res['ninteenth_largest'] = res['Top19'] + "-" + res['19_largest'].astype(str)
|
||
res['tewenty_largest'] = res['Top20'] + "-" + res['20_largest'].astype(str)
|
||
res['tone_largest'] = res['Top21'] + "-" + res['21_largest'].astype(str)
|
||
res['ttwo_largest'] = res['Top22'] + "-" + res['22_largest'].astype(str)
|
||
res['tthree_largest'] = res['Top23'] + "-" + res['23_largest'].astype(str)
|
||
res['tfour_largest'] = res['Top24'] + "-" + res['24_largest'].astype(str)
|
||
res['tfive_largest'] = res['Top25'] + "-" + res['25_largest'].astype(str)
|
||
res['tsix_largest'] = res['Top26'] + "-" + res['26_largest'].astype(str)
|
||
res['tseven_largest'] = res['Top27'] + "-" + res['27_largest'].astype(str)
|
||
res['teight_largest'] = res['Top28'] + "-" + res['28_largest'].astype(str)
|
||
|
||
# res['largest1'] = res['Top1']
|
||
# res['largest2'] = res['Top2']
|
||
# res['largest3'] = res['Top3']
|
||
# res['largest4'] = res['Top4']
|
||
# res['largest5'] = res['Top5']
|
||
# res['largest6'] = res['Top6']
|
||
# res['largest7'] = res['Top7']
|
||
# res['largest8'] = res['Top8']
|
||
# res['largest9'] = res['Top9']
|
||
# res['largest10'] = res['Top10']
|
||
# res['largest11'] = res['Top11']
|
||
# res['largest12'] = res['Top12']
|
||
# res['largest13'] = res['Top13']
|
||
# res['largest14'] = res['Top14']
|
||
# res['largest15'] = res['Top15']
|
||
# res['largest16'] = res['Top16']
|
||
# res['largest17'] = res['Top17']
|
||
# res['largest18'] = res['Top18']
|
||
# res['largest19'] = res['Top19']
|
||
# res['largest20'] = res['Top20']
|
||
# res['largest21'] = res['Top21']
|
||
# res['largest22'] = res['Top22']
|
||
# res['largest23'] = res['Top23']
|
||
# res['largest24'] = res['Top24']
|
||
# res['largest25'] = res['Top25']
|
||
# res['largest26'] = res['Top26']
|
||
# res['largest27'] = res['Top27']
|
||
# res['largest28'] = res['Top28']
|
||
# print(res)
|
||
# res.drop(['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
|
||
# 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25', 'Top26', 'Top27','Top28'], axis=1, inplace=True)
|
||
|
||
y = ['line_no',
|
||
'data',
|
||
'actual_element',
|
||
'Identification_Status',
|
||
'scene_number',
|
||
'plb',
|
||
'nlb',
|
||
'ssc',
|
||
'lcp',
|
||
'case',
|
||
'parenthetical',
|
||
'isIdentified',
|
||
'When_Identified',
|
||
'first_largest',
|
||
'second_largest',
|
||
'third_largest',
|
||
'fourth_largest',
|
||
'fifth_largest',
|
||
'sixth_largest',
|
||
'seventh_largest',
|
||
'eight_largest',
|
||
'ninth_largest',
|
||
'tenth_largest',
|
||
'eleventh_largest',
|
||
'twelth_largest',
|
||
'thirteenth_largest',
|
||
'fourteenth_largest',
|
||
'fifteenth_largest',
|
||
'sixteenth_largest',
|
||
'seventeenth_largest',
|
||
'eighteenth_largest',
|
||
'ninteenth_largest',
|
||
'tewenty_largest',
|
||
'tone_largest',
|
||
'ttwo_largest',
|
||
'tthree_largest',
|
||
'tfour_largest',
|
||
'tfive_largest',
|
||
'tsix_largest',
|
||
'tseven_largest',
|
||
'teight_largest',
|
||
# 'largest1',
|
||
# 'largest2',
|
||
# 'largest3',
|
||
# 'largest4',
|
||
# 'largest5',
|
||
# 'largest6',
|
||
# 'largest7',
|
||
# 'largest8',
|
||
# 'largest9',
|
||
# 'largest10',
|
||
# 'largest11',
|
||
# 'largest12',
|
||
# 'largest13',
|
||
# 'largest14',
|
||
# 'largest15',
|
||
# 'largest16',
|
||
# 'largest17',
|
||
# 'largest18',
|
||
# 'largest19',
|
||
# 'largest20',
|
||
# 'largest21',
|
||
# 'largest22',
|
||
# 'largest23',
|
||
# 'largest24',
|
||
# 'largest25',
|
||
# 'largest26',
|
||
# 'largest27',
|
||
'ps1',
|
||
'ps2',
|
||
'ps3',
|
||
'ps4',
|
||
'ps5',
|
||
'ps6',
|
||
'ps7',
|
||
'ps8',
|
||
'ps9',
|
||
'ps10',
|
||
'ps11',
|
||
'ps12',
|
||
'ps13',
|
||
'ps14',
|
||
'ps15',
|
||
'ps16',
|
||
'ps17', 'ps18', 'ps19', 'ps21', 'ps22', 'ps25', 'ps26', 'ps27', 'ps28','ps29', 'ps30', 'ps31',
|
||
'pnbl_line_no',
|
||
'nnbl_line_no',
|
||
'ppnbl_line_no',
|
||
'nnnbl_line_no',
|
||
'pdil_line_no',
|
||
'ndil_line_no'
|
||
]
|
||
|
||
|
||
|
||
|
||
df = res.reindex(columns=y)
|
||
|
||
|
||
return(df)
|
||
|
||
|
||
def prep_for_pos_elimination(df):
    """Rank and prune the candidate possibilities ("psN") of every line in place.

    For each line whose ``isIdentified`` is ``'No'`` the function:

    1. ranks all candidate possibilities by the integer weight stored in the
       column of the same name and writes the ranking to
       ``Identification_Status`` (names only) and
       ``Identification_Status_with_weights`` (``"<ps>-<weight>"`` tokens),
       both ``;``-separated, highest weight first;
    2. fills ``ps_not_to_remove`` with the possibilities that later
       elimination steps must keep (top-weighted ones plus a few derived from
       the text of the line);
    3. drops parenthetical-related possibilities depending on the
       ``parenthetical`` column;
    4. restricts the first and last line to fixed sets of possibilities;
    5. re-ranks every line that still has more than one possibility.

    Args:
        df: DataFrame with at least 12 columns, including ``line_no``,
            ``data``, ``Identification_Status``, ``isIdentified``,
            ``parenthetical``, ``case`` and the ``psN`` weight columns.
            Labels ``0`` and ``df.index[-1] - 1`` are addressed directly, so
            a default integer index is assumed.

    Returns:
        None. ``df`` is modified in place; the columns
        ``Identification_Status_with_weights`` (inserted at position 12) and
        ``ps_not_to_remove`` are added.

    Raises:
        ValueError: if ``Identification_Status_with_weights`` already exists
            (raised by ``DataFrame.insert``).
    """
    # ps23, ps24, ps32, ps33 and ps34 are never ranked.
    unranked = ('ps23', 'ps24', 'ps32', 'ps33', 'ps34')
    all_pos = [ps for ps in ("ps{0}".format(n) for n in range(1, 35))
               if ps not in unranked]

    df.insert(12, 'Identification_Status_with_weights', '')

    def weight_of(index, pos):
        """Return the integer weight of *pos* on row *index*, or None if unusable."""
        try:
            return int(df.loc[index, pos])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing column, non-numeric value, NaN or infinity: the
            # possibility is simply left out of the ranking.
            return None

    def rank(index, candidates):
        """Return ``(ps, weight)`` pairs for *candidates*, heaviest first.

        The sort is stable, so equal weights keep the order of *candidates*.
        """
        weighted = []
        for pos in candidates:
            weight = weight_of(index, pos)
            if weight is not None:
                weighted.append((pos, weight))
        weighted.sort(key=lambda pair: pair[1], reverse=True)
        return weighted

    def store_ranking(index, weighted):
        """Write a ranking to both status columns of row *index*."""
        df.loc[index, 'Identification_Status_with_weights'] = ';'.join(
            "{0}-{1}".format(pos, weight) for pos, weight in weighted)
        line_pos_string = ';'.join(pos for pos, _ in weighted)
        print(line_pos_string)
        df.loc[index, 'Identification_Status'] = line_pos_string

    def drop_possibilities(index, doomed):
        """Remove *doomed* possibilities from the status and keep-list of a row."""
        for column in ('Identification_Status', 'ps_not_to_remove'):
            survivors = [ps for ps in df.loc[index, column].split(";")
                         if ps not in doomed]
            df.loc[index, column] = ";".join(survivors)

    ## rank every possibility of the unidentified lines by decreasing weight
    for index in df.index:
        if df.loc[index, 'isIdentified'] == 'No':
            store_ranking(index, rank(index, all_pos))

    ## possibilities not to eliminate (identified lines keep this default)
    df['ps_not_to_remove'] = 'ps34'

    # NOTE(review): the indentation of this loop was lost in the copy of the
    # file that was reviewed. Everything below is assumed to apply to
    # unidentified lines only (otherwise the 'ps34' default above would always
    # be overwritten and blank lines would fail on words[0]) -- confirm.
    for index in df.index:
        if df.loc[index, 'isIdentified'] != 'No':
            continue

        data = df.loc[index, 'data']
        pos_not_to_remove = []

        ## find the top possibilities - all those tied for the maximum weight
        pos_with_wts = df.loc[index, 'Identification_Status_with_weights'].split(";")
        print(df.loc[index, 'line_no'])
        max_pos_index = 0
        for k in range(len(pos_with_wts) - 1):
            # partition() keeps the sign of a negative weight ("ps1--3" -> "-3")
            wt1 = pos_with_wts[k].partition("-")[2]
            wt2 = pos_with_wts[k + 1].partition("-")[2]
            print(wt1, wt2)
            if wt2 != wt1:
                break
            max_pos_index = k + 1
        print(max_pos_index)
        ranked = df.loc[index, 'Identification_Status'].split(";")
        pos_not_to_remove.extend(ranked[:max_pos_index + 1])

        # if entire line in parenthetical don't remove ps8,ps10
        stripped = data.strip()
        if stripped.startswith('(') and stripped.endswith(')'):
            pos_not_to_remove += ['ps8', 'ps10']

        words = data.split()
        if words and len(words[0]) > 1 and words[0].isupper():
            pos_not_to_remove += ['ps8', 'ps25', 'ps30']

        # NOTE(review): assumed to be independent of the upper-case test above.
        if len(words) == 1:
            pos_not_to_remove.append('ps7')

        # a "(...)" pair that opens after the start of the line
        left_p = data.find('(')
        right_p = data.find(')')
        if (left_p != -1 and right_p != -1
                and not stripped.startswith('(') and right_p > left_p):
            pos_not_to_remove += ['ps25', 'ps27']

        # de-duplicate while keeping insertion order so the stored string is
        # the same on every run (a set would depend on hash randomisation)
        unique_pos = []
        for ps in pos_not_to_remove:
            if ps not in unique_pos:
                unique_pos.append(ps)
        df.loc[index, 'ps_not_to_remove'] = ";".join(unique_pos)

    ## remove parenthetical possibilities that the line's parenthetical state rules out
    no_parenthetical = ['ps10', 'ps11', 'ps12', 'ps25', 'ps26', 'ps27']
    partial_parenthetical = ['ps11', 'ps12']
    for index in df.index:
        par = df.loc[index, 'parenthetical']
        if par == 'Absent':
            drop_possibilities(index, no_parenthetical)
        elif par in ('PartMidEnd', 'PartStartMid', 'PartMidMid'):
            drop_possibilities(index, partial_parenthetical)

    if df.empty:
        return

    ## refine the possibilities of first and last line
    first_line_index = 0
    if df.loc[first_line_index, 'case'] == 'None':
        first_line_index += 1

    last_line_index = df.index[-1]
    if df.loc[last_line_index, 'case'] == 'None':
        last_line_index -= 1

    if first_line_index in df.index:
        eligible_pos = ['ps1', 'ps2', 'ps17', 'ps18']
        first_line_pos = df.loc[first_line_index, 'Identification_Status'].split(";")
        # keeps the line's own (ranked) order
        first_line_pos = [ps for ps in first_line_pos if ps in eligible_pos]
        df.loc[first_line_index, 'Identification_Status'] = ";".join(first_line_pos)
        df.loc[first_line_index, 'ps_not_to_remove'] = ""

    if last_line_index in df.index:
        eligible_pos = ['ps6', 'ps15', 'ps16', 'ps17', 'ps29', 'ps30', 'ps31']
        last_line_pos = df.loc[last_line_index, 'Identification_Status'].split(";")
        # keeps the order of eligible_pos (as the original code did)
        last_line_pos = [ps for ps in eligible_pos if ps in last_line_pos]
        df.loc[last_line_index, 'Identification_Status'] = ";".join(last_line_pos)
        df.loc[last_line_index, 'ps_not_to_remove'] = ""

    ## re-rank whatever is left on lines that still have several possibilities
    for index in df.index:
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        if len(cur_line_pos) == 1:
            continue
        store_ranking(index, rank(index, cur_line_pos))
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def examine_speaker_pos(df, audit_df):
    """Use the already identified speaker lines to identify further speaker lines.

    Lines whose ``Identification_Status`` is exactly ``'ps7'`` supply the list
    of known speaker names. The function then:

    1. marks unidentified lines whose text equals a known name (ignoring case
       and inner spaces) as speaker lines and normalises them to upper case;
    2. repairs names that were split over two consecutive lines, either by
       merging the lines or, when a ``(`` follows, by moving the
       parenthetical part to the second line;
    3. for every other line containing a known name, either splits
       ``NAME (parenthetical)`` into a speaker line plus a new parenthetical
       line (``line_no + 0.5``) or narrows the line's possibilities;
    4. recomputes ``isIdentified`` from the number of remaining possibilities.

    Args:
        df: line table with (at least) the columns ``line_no``, ``data``,
            ``Identification_Status``, ``isIdentified``, ``parenthetical``,
            ``When_Identified``, ``ssc``, ``case``, ``plb``, ``nlb`` and
            ``ps_not_to_remove``; a default integer index is assumed. It is
            not modified.
        audit_df: audit table addressed by line number; the columns
            ``line_removed``, ``case_corrected``,
            ``space_removed_between_characters`` and
            ``line_broken_into_multiple_lines`` are updated in place.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the updated lines.
    """

    def show(*values):
        """Debug print that never aborts the audit on console encoding errors."""
        try:
            print(*values)
        except (UnicodeError, OSError):
            pass

    print("examining speaker possibilties")
    status_column = df['Identification_Status']

    # known speaker names, in order of first appearance; empty names are
    # ignored because they would "match" every line
    unique_speaker_list = []
    for name in df.loc[status_column == 'ps7', 'data'].astype(str):
        name = name.strip()
        if name and name not in unique_speaker_list:
            unique_speaker_list.append(name)
    known_speakers = set(unique_speaker_list)

    speaker_lines_list = df.loc[
        (status_column == 'ps7') | (status_column == 'ps8'), 'line_no'].to_list()
    speaker_in_two_lines_list = []

    show(unique_speaker_list)
    print(speaker_lines_list)

    ## find further speaker lines and speaker names split over two lines
    last_index = df.index[-1] if len(df.index) else None
    for index in df.index:
        if df.loc[index, 'Identification_Status'] in ('ps7', 'ps8'):
            continue
        line_no = df.loc[index, 'line_no']
        raw_data = df.loc[index, 'data']
        if not isinstance(raw_data, str):
            show(line_no, raw_data, "data is not str")
            continue
        data = raw_data.strip()

        if "".join(data.split()).upper() in known_speakers or data.upper() in known_speakers:
            show(line_no, data)
            if line_no not in speaker_lines_list and df.loc[index, 'isIdentified'] != 'Yes':
                speaker_lines_list.append(line_no)

        if index == last_index:
            continue
        nl_data = df.loc[index + 1, 'data']
        if not isinstance(nl_data, str):
            continue
        if data and nl_data.strip() and df.loc[index, 'parenthetical'] == 'Absent':
            two_line_data = "".join((data + nl_data).split())
            name_before_par = two_line_data.split("(")[0]
            split_name = two_line_data in known_speakers
            # name followed by "(": only when the first line alone is not a name
            split_name_with_par = (
                name_before_par in known_speakers
                and data.split("(")[0].strip() not in known_speakers)
            if split_name or split_name_with_par:
                show(line_no, data)
                show(line_no, nl_data)
                speaker_in_two_lines_list.append(line_no)

    speaker_lines_list.sort()
    speaker_in_two_lines_list.sort()
    print(speaker_lines_list)
    print(speaker_in_two_lines_list)

    # work on a copy addressed by line number
    df_line_index = df.copy().set_index('line_no')

    ## correct the speaker in two lines
    for sp_line in speaker_in_two_lines_list:
        next_line = sp_line + 1
        # either line may already have been merged away by an earlier repair
        if sp_line not in df_line_index.index or next_line not in df_line_index.index:
            continue
        if df_line_index.loc[sp_line, 'Identification_Status'] == 'ps8':
            continue
        data = df_line_index.loc[sp_line, 'data']
        nl_data = df_line_index.loc[next_line, 'data']
        new_data = data.strip() + nl_data.strip()

        par_pos = new_data.find('(')
        if par_pos != -1:
            # speaker stays on the first line, parenthetical goes to the second
            after_par = new_data[par_pos:]
            df_line_index.loc[sp_line, 'data'] = new_data[:par_pos]
            df_line_index.loc[next_line, 'data'] = after_par

            tail = after_par.strip()
            if tail.endswith(')'):
                new_status, new_par = 'ps10', 'Complete'
            elif ')' in tail:
                # The original tested re.match(')') at the start of a string
                # that always begins with "(", so this branch was unreachable.
                new_status, new_par = 'ps26', 'PartStartMid'
            else:
                new_status, new_par = 'ps11', 'StartingLeft'
            df_line_index.loc[next_line, 'Identification_Status'] = new_status
            df_line_index.loc[next_line, 'parenthetical'] = new_par
            df_line_index.loc[next_line, 'When_Identified'] = 'ExaminingSpeakerLines'

            # add line no to speaker lines
            speaker_lines_list.append(sp_line)
        else:
            # The original message was prefixed with `audit_sno`, a name that
            # is not defined in this function (NameError); the prefix is dropped.
            print("line no: %s , Found Speaker in two lines merged the line \n" % (sp_line,))
            # correct the line
            df_line_index.loc[sp_line, 'data'] = new_data
            # add line no to speaker lines
            speaker_lines_list.append(sp_line)
            # delete the next line
            df_line_index.drop(next_line, inplace=True)
            audit_df.loc[next_line, 'line_removed'] = 'Yes'
            show(data, nl_data, new_data)

    ## go through the speaker lines and rectify them
    for sp_line in speaker_lines_list:
        if sp_line not in df_line_index.index:
            continue
        if df_line_index.loc[sp_line, 'Identification_Status'] in ('ps7', 'ps8'):
            continue

        df_line_index.loc[sp_line, 'Identification_Status'] = 'ps7'
        df_line_index.loc[sp_line, 'parenthetical'] = 'Absent'
        if df_line_index.loc[sp_line, 'When_Identified'] != 'FirstStrictConditions':
            df_line_index.loc[sp_line, 'When_Identified'] = 'ExaminingSpeakerLines'

        sp_data = df_line_index.loc[sp_line, 'data'].strip()
        print(sp_line)
        show(sp_data)
        try:
            new_speaker_indent = int(df_line_index.loc[sp_line, 'ssc'])
        except (TypeError, ValueError):
            new_speaker_indent = 35  # fallback indent when 'ssc' is unusable

        ## check and correct case
        if df_line_index.loc[sp_line, 'case'] != 'AllUpper':
            show(sp_data)
            df_line_index.loc[sp_line, 'data'] = sp_data.rjust(
                len(sp_data) + new_speaker_indent).upper()
            df_line_index.loc[sp_line, 'case'] = 'AllUpper'
            show("case corrected to:", df_line_index.loc[sp_line, 'data'])
            audit_df.loc[sp_line, 'case_corrected'] = 'Speaker Case corrected to All Upper Case'

        ## check and correct gap between speaker name
        sp_data_nogap = "".join(sp_data.split())
        if sp_data != sp_data_nogap:
            show(sp_data)
            ## removing gap only if first word is a single character
            if len(sp_data.split()[0]) == 1:
                df_line_index.loc[sp_line, 'data'] = sp_data_nogap.upper().rjust(
                    len(sp_data_nogap) + new_speaker_indent)
                show("speaker name corrected to:", df_line_index.loc[sp_line, 'data'])
                audit_df.loc[sp_line, 'space_removed_between_characters'] = 'Yes'

    # back to a positional 0..n-1 index with 'line_no' as a regular column
    df = df_line_index.reset_index()

    ## mixed speaker identification
    ## use the speakers to identify mixed speaker lines ps8, 25,30,21,28
    df['ps_not_to_remove'] = df['ps_not_to_remove'].astype(str)
    show(unique_speaker_list)
    print("performing checks for speaker followed by parenthetical")

    # a line "NAME something" (no parenthetical) cannot be any of these
    ps_remove = ['ps1', 'ps2', 'ps3', 'ps7', 'ps9', 'ps10', 'ps11', 'ps12', 'ps16',
                 'ps17', 'ps18', 'ps19', 'ps21', 'ps22', 'ps26', 'ps27', 'ps28',
                 'ps29', 'ps31']

    # Rows are inserted while scanning, so walk by position against the live
    # frame; iterating a snapshot of df.index would never reach the last rows.
    position = 0
    while position < len(df):
        index = position
        position += 1

        status = df.loc[index, 'Identification_Status']
        if status == 'ps8':
            continue
        line_no = df.loc[index, 'line_no']
        print("line_no", line_no)
        data = df.loc[index, 'data']
        if not isinstance(data, str):
            continue

        cur_pos_list = status.split(";")
        new_pos_list = cur_pos_list  # same list object: removals below affect both

        protected = df.loc[index, 'ps_not_to_remove']
        protected = protected.split(";") if isinstance(protected, str) else []
        # protected possibilities the line currently has, in ranked order
        pos_not_to_remove = []
        for ps in cur_pos_list:
            if ps in protected and ps not in pos_not_to_remove:
                pos_not_to_remove.append(ps)

        for speaker in unique_speaker_list:
            check_done = False
            if status in ('ps7', 'ps8'):
                continue
            # Literal, case-sensitive search; the same occurrence is used for
            # the test and for the split positions.
            pos_starts = data.find(speaker)
            if pos_starts == -1:
                continue
            before_speaker = data[:pos_starts]
            after_speaker = data[pos_starts + len(speaker):]
            char1_after_speaker = after_speaker.lstrip()[:1]

            print("match found")
            show("data 2347:", data)
            show("speaker 2348:", speaker)
            show("before speaker 2349 :", before_speaker)
            show("after speaker 2350:", after_speaker)
            show("char1_after_speaker 2367:", char1_after_speaker)

            try:
                new_speaker_indent = int(df.loc[index, 'ssc'])
            except (TypeError, ValueError):
                new_speaker_indent = 35

            only_indent_before = before_speaker.isspace()
            par_state = df.loc[index, 'parenthetical']

            ## separate parenthetical if speaker is followed by parenthetical
            if (only_indent_before and char1_after_speaker == '('
                    and par_state == 'PartMidEnd' and "V.O." not in after_speaker):
                print("Seperating parenthetical")
                print("Identifying speaker")
                print(index)
                df.loc[index, 'data'] = before_speaker + speaker
                df.loc[index, 'parenthetical'] = 'Absent'
                df.loc[index, 'When_Identified'] = 'ExaminingSpeakerLines'
                df.loc[index, 'case'] = 'AllUpper'
                df.loc[index, 'Identification_Status'] = 'ps7'
                nlb = df.loc[index, 'nlb']
                df.loc[index, 'nlb'] = 'N'
                audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = \
                    'Separated Speaker and Parenthetical'

                new_line_no = line_no + 0.5
                print("identifying parenthetical")
                # a fractional label sorts the new row right after the speaker
                new_label = index + 0.25
                df.loc[new_label] = np.nan
                df.loc[new_label, 'data'] = str(after_speaker)
                df.loc[new_label, 'parenthetical'] = 'Complete'
                df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerLines'
                df.loc[new_label, 'Identification_Status'] = 'ps10'
                df.loc[new_label, 'case'] = ''
                df.loc[new_label, 'plb'] = 'N'
                df.loc[new_label, 'nlb'] = nlb
                df.loc[new_label, 'line_no'] = new_line_no
                df = df.sort_index().reset_index(drop=True)

                audit_df.loc[new_line_no] = np.nan
                audit_df.loc[new_line_no, 'line_removed'] = 'No'
                # the line is now a plain speaker line; nothing left to examine
                break

            elif (only_indent_before and char1_after_speaker == '('
                    and par_state != 'PartMidMid'):
                ## next non-blank character after speaker is "("
                print("parenthetical mix")
                new_pos_list = ['ps8', 'ps25']
                # NOTE(review): both statements are assumed to sit under this
                # test; the original indentation was lost -- confirm.
                if df.loc[index, 'ssc'] != 35:
                    df.loc[index, 'ssc'] = new_speaker_indent
                    df.loc[index, 'data'] = data.strip().rjust(
                        len(data.strip()) + new_speaker_indent)

            elif only_indent_before and not after_speaker.isspace():
                for ps in ps_remove:
                    if ps in new_pos_list:
                        new_pos_list.remove(ps)
                print("present but not parenthetical removed except - ps30;ps4;ps5;ps6;ps8;ps13;ps14;ps15")

            elif not only_indent_before and after_speaker.isspace():
                new_pos_list = ['ps21', 'ps28', 'ps5', 'ps4']
                print("before speaker present")
                check_done = True

            else:
                new_pos_list = cur_pos_list
                print("no change done")

            ## put back protected possibilities that got eliminated; reversed
            ## iteration keeps them in ranked order at the front of the list
            for ps in reversed(pos_not_to_remove):
                if ps not in new_pos_list:
                    new_pos_list.insert(0, ps)

            status = ";".join(str(elem) for elem in new_pos_list)
            df.loc[index, 'Identification_Status'] = status
            print('\n')

            if check_done:
                break

    ## a line is identified once a single possibility (or 'blank') is left
    for index in df.index:
        remaining = df.loc[index, 'Identification_Status']
        if remaining == 'blank' or len(remaining.split(";")) == 1:
            df.loc[index, 'isIdentified'] = 'Yes'
        else:
            df.loc[index, 'isIdentified'] = 'No'

    return df
|
||
|
||
def examine_speaker_next_lines(df,audit_df):
|
||
df.reset_index(inplace=True, drop=True)
|
||
|
||
## identifying lines after speaker
|
||
## get the speaker lines
|
||
|
||
speaker_lines_list = df.loc[(df['Identification_Status'] == 'ps7') | (df['Identification_Status'] == 'ps8'),'line_no'].to_list()
|
||
|
||
## go through the speaker lines and identify the lines after speaker
|
||
for line in speaker_lines_list:
|
||
blank_to_delete = []
|
||
index = df.loc[df['line_no'] == line,:].index.values[0]
|
||
data = df['data'][index]
|
||
speaker_name = data.strip()
|
||
print("\n")
|
||
print("speaker index",index)
|
||
print("speaker line no",line)
|
||
try:
|
||
print("data:\n",data)
|
||
except:
|
||
pass
|
||
# move the index to next nbl line to check it
|
||
|
||
if df['nlb'][index] == 'Y':
|
||
index += 2
|
||
else:
|
||
index += 1
|
||
|
||
possible_dialog_line = False
|
||
parenthetical_begun = False
|
||
## examine the lines(if any) after speaker and before dialougue
|
||
## move index till end paranthetical comes
|
||
while not possible_dialog_line:
|
||
data = df['data'][index]
|
||
|
||
line_no = df['line_no'][index]
|
||
cur_line_par = df['parenthetical'][index]
|
||
print("examining line")
|
||
try:
|
||
print(data)
|
||
print(line_no,cur_line_par)
|
||
except:
|
||
pass
|
||
if df['Identification_Status'][index] == 'ps10':
|
||
index += 1
|
||
#possible_dialog_line = True
|
||
|
||
elif cur_line_par == 'Complete':
|
||
# complete line in paranthetical
|
||
print("Identifying as Parenthetical Complete")
|
||
df['Identification_Status'][index] = 'ps10'
|
||
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
index +=1
|
||
possible_dialog_line = True
|
||
elif cur_line_par == 'StartingLeft' and not(parenthetical_begun) :
|
||
# line has paranthetical in beginning only
|
||
print("identifying as parenthetical Beginning")
|
||
df['Identification_Status'][index] = 'ps11'
|
||
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
parenthetical_begun = True
|
||
index +=1
|
||
elif cur_line_par == 'EndingRight' and parenthetical_begun:
|
||
# line has paranthetical at end only
|
||
print("Identifying as parenthetical end")
|
||
df['Identification_Status'][index] = 'ps12'
|
||
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
index +=1
|
||
possible_dialog_line = True
|
||
|
||
elif cur_line_par == 'Absent' and parenthetical_begun:
|
||
# multiple lines could be in paranthetical
|
||
print("Identifying as parenthetical middle")
|
||
df['Identification_Status'][index] = 'ps20'
|
||
df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
index +=1
|
||
|
||
elif df['Identification_Status'][index] == 'ps13' or df['Identification_Status'][index] == 'ps14':
|
||
## dialogue end cans still be there
|
||
index += 1
|
||
|
||
# elif cur_line_par == 'PartBeginningMid':
|
||
# # starting part of line in paranthetical
|
||
# print("Identifying as parenthetical mix with succeeding dialogue")
|
||
# df['Identification_Status'][index] = 'ps26'
|
||
# df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
# index +=1
|
||
# possible_dialog_line = True
|
||
|
||
# elif cur_line_par == 'PartMidEnd':
|
||
# # end part line in parenthetical
|
||
# print("Identifying as parenthetical mix with preceeding dialogue")
|
||
# df['Identification_Status'][index] = 'ps27'
|
||
# df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
|
||
# ## breaking the line to dialogue and parenthtical complete-
|
||
|
||
# index +=1
|
||
# possible_dialog_line = True
|
||
|
||
elif cur_line_par == 'Absent':
|
||
print("line should be dialogue")
|
||
possible_dialog_line = True
|
||
else:
|
||
print("line could be dialogue")
|
||
possible_dialog_line = True
|
||
|
||
print(line_no,possible_dialog_line,parenthetical_begun)
|
||
|
||
|
||
## skip if blank
|
||
if df['Identification_Status'][index] == 'blank':
|
||
print("skipping blank line")
|
||
blank_to_delete.append(index)
|
||
index +=1
|
||
|
||
|
||
# check if the current line has possibility of being a dialogue , if not move to next speaker line
|
||
if 'ps15' not in df['Identification_Status'][index].split(";") :
|
||
print("line does not have possibility of dialogue, so cannot process")
|
||
continue
|
||
else:
|
||
print("dialogue line(s) after speaker")
|
||
|
||
|
||
cur_indent = df['ssc'][index]
|
||
cur_line_par = df['parenthetical'][index]
|
||
|
||
next_line_blank = True if df['nlb'][index] == 'Y' else False
|
||
|
||
if index+2 > df.index[-1]:
|
||
break
|
||
|
||
if next_line_blank:
|
||
next_nbl_indent = df['ssc'][index+2]
|
||
next_nbl_par = df['parenthetical'][index+2]
|
||
next_nbl_case = df['case'][index+2]
|
||
next_nbl_data = df['data'][index+2]
|
||
j = index + 2
|
||
|
||
else:
|
||
next_nbl_indent = df['ssc'][index+1]
|
||
next_nbl_par = df['parenthetical'][index+1]
|
||
next_nbl_case = df['case'][index+1]
|
||
next_nbl_data = df['data'][index+1]
|
||
j = index + 1
|
||
|
||
start_index = index
|
||
rev_index = index
|
||
lines_count = 1
|
||
data = df['data'][index]
|
||
print("all lines after speaker with same indent with parentheticals")
|
||
try:
|
||
print(data)
|
||
except:
|
||
pass
|
||
|
||
dia_indent = cur_indent
|
||
print(dia_indent)
|
||
## adding condition - next line can be parenthetical
|
||
## removing conditions - next to next line check not required
|
||
## get the number of line with same indent
|
||
while (cur_indent == next_nbl_indent or dia_indent == next_nbl_indent or next_nbl_par in ('Complete','StartingLeft','EndingRight')) and not (next_nbl_case == 'AllUpper') and (not next_line_blank): # or next_nbl_case in ('AllLower','MidUpper','Partial')): ## considering dialogue ends if next blank except next line blank and nn line case Alllower
|
||
#and len(next_nbl_data.split()) == 1):
|
||
|
||
lines_count +=1
|
||
cur_indent = next_nbl_indent
|
||
cur_line_par = next_nbl_par
|
||
next_line_blank = True if df['nlb'][j] == 'Y' else False
|
||
data = df['data'][j]
|
||
try:
|
||
print(data)
|
||
print(cur_indent)
|
||
except:
|
||
pass
|
||
rev_index = j
|
||
|
||
if j+1 >= df.index[-1]:
|
||
break
|
||
|
||
if next_line_blank:
|
||
break ## as not considering space between dialogue .. action becoming dialogue
|
||
#next_nbl_indent = df['ssc'][j+2]
|
||
#next_nbl_par = df['parenthetical'][j+2]
|
||
#next_nbl_case = df['case'][j+2]
|
||
#next_nbl_data = df['data'][j+2]
|
||
#j += 2
|
||
|
||
else:
|
||
next_nbl_indent = df['ssc'][j+1]
|
||
next_nbl_par = df['parenthetical'][j+1]
|
||
next_nbl_case = df['case'][j+1]
|
||
next_nbl_data = df['data'][j+1]
|
||
j += 1
|
||
|
||
print("\n Next line indent is",next_nbl_indent)
|
||
|
||
|
||
## now lets start examing these lines in reverse order
|
||
## if last line has parenthtical end or complete then it is action mixed not dialogue
|
||
|
||
last_line_par = cur_line_par
|
||
data = df['data'][rev_index]
|
||
dialogue_end_identified = False
|
||
dia_end = rev_index
|
||
###
|
||
|
||
|
||
# last line is mixed with parenthetical. if it is parenthetical Mid end -> separate to new action line
|
||
if last_line_par == 'PartMidEnd':
|
||
print("Dialogue mixed with parenthetical")
|
||
#separate line to before and after parenthetical
|
||
par_start = re.search('\(',data).start()
|
||
before_par = data[:par_start]
|
||
after_par = data[par_start:]
|
||
# make current line as before par and tag as dialogue
|
||
print(" identifying before parenthentical line as ps15")
|
||
try:
|
||
print(before_par)
|
||
except:
|
||
pass
|
||
df['data'][rev_index] = before_par
|
||
next_line_flag = df['nlb'][rev_index]
|
||
df['nlb'][rev_index] = 'N'
|
||
df['Identification_Status'][rev_index] = 'ps15'
|
||
df['parenthetical'][rev_index] = 'Absent'
|
||
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index] = 'Yes'
|
||
dialogue_end_identified = True
|
||
|
||
|
||
#make new next line as action line ps6
|
||
print("action after dialogue, separating to newline , identifying line as ps6")
|
||
try:
|
||
print(after_par)
|
||
print("after_par is here")
|
||
except:
|
||
pass
|
||
print("df['line_no'][rev_index]:",df['line_no'][rev_index])
|
||
print(df['line_no'][rev_index])
|
||
line_no = df['line_no'][rev_index]
|
||
next_line_no = df['line_no'][rev_index+1]
|
||
new_line_no = (float(line_no) + float(next_line_no)) / 2
|
||
|
||
if new_line_no in audit_df.index:
|
||
new_line_no = (float(new_line_no) + float(next_line_no))/2
|
||
new_line_no = new_line_no
|
||
|
||
audit_df.loc[new_line_no] = np.nan
|
||
audit_df.loc[new_line_no]['line_removed'] = 'No'
|
||
|
||
audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
|
||
|
||
# add line before action end
|
||
df.loc[rev_index + 0.25] = np.nan
|
||
df.loc[rev_index + 0.25,'ssc'] = 0
|
||
new_data = speaker_name.capitalize() + ' ' + after_par.replace('(','').replace(')','')
|
||
df.loc[rev_index + 0.25,'data'] = new_data
|
||
df.loc[rev_index + 0.25,'case'] = ''
|
||
df.loc[rev_index + 0.25,'plb'] = 'N'
|
||
df.loc[rev_index + 0.25,'nlb'] = next_line_flag
|
||
df.loc[rev_index + 0.25,'Identification_Status'] = 'ps6'
|
||
df.loc[rev_index + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
|
||
df.loc[rev_index + 0.25,'isIdentified'] = 'Yes'
|
||
df.loc[rev_index + 0.25,'parenthetical'] = 'Absent'
|
||
df.loc[rev_index + 0.25,'line_no'] = new_line_no
|
||
try:
|
||
print(new_data)
|
||
except:
|
||
pass
|
||
# insert audit report
|
||
|
||
|
||
df = df.sort_index().reset_index(drop=True)
|
||
|
||
elif last_line_par == 'EndingRight':
|
||
print("last line has parenthtical end")
|
||
# if parentical is end then find the beginning and split after begiining identify as action
|
||
j=1
|
||
beginning_not_found = True
|
||
while beginning_not_found:
|
||
print("looking for beginning parenthtical")
|
||
data = df['data'][rev_index-j]
|
||
try:
|
||
print(data)
|
||
except:
|
||
pass
|
||
if df['parenthetical'][rev_index-j] in ('StartingLeft','MixedLeft') :
|
||
beginning_not_found = False
|
||
else:
|
||
j+=1
|
||
if beginning_not_found == False :
|
||
print("parenthetical beginning found")
|
||
if df['parenthetical'][rev_index-j] == 'MixedLeft' and (rev_index-j)>=start_index:
|
||
data = df['data'][rev_index-j]
|
||
#separate line to before and after parenthetical
|
||
par_start = re.search('\(',data).start()
|
||
before_par = data[:par_start]
|
||
after_par = data[par_start:]
|
||
# make current line as before par and tag as dialogue
|
||
print(" splitting and identifying before parenthentical line as ps15")
|
||
try:
|
||
print(before_par)
|
||
except:
|
||
pass
|
||
print("here")
|
||
df['data'][rev_index-j] = before_par
|
||
next_line_flag = df['nlb'][rev_index-j]
|
||
df['nlb'][rev_index-j] = 'N'
|
||
df['Identification_Status'][rev_index-j] = 'ps15'
|
||
df['parenthetical'][rev_index-j] = 'Absent'
|
||
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index-j] = 'Yes'
|
||
dialogue_end_identified = True
|
||
dia_end = rev_index-j
|
||
|
||
#make new next line as action line
|
||
print("action after dialogue, separating to newline ")
|
||
try:
|
||
print(after_par)
|
||
print("after par")
|
||
except:
|
||
pass
|
||
|
||
line_no = df['line_no'][rev_index-j]
|
||
print("2799",type(line_no))
|
||
try:
|
||
next_line_no = df['line_no'][rev_index-j+1]
|
||
print("2802",type(new_line_no))
|
||
except:
|
||
next_line_no = df['line_no'][int(rev_index-j+1)]
|
||
print("2805",rev_index-j+1)
|
||
try:
|
||
new_line_no = (line_no + next_line_no) / 2
|
||
except:
|
||
new_line_no = (float(line_no) + float(next_line_no)) / 2
|
||
|
||
try:
|
||
if new_line_no in audit_df.index:
|
||
new_line_no = (new_line_no + next_line_no)/2
|
||
except:
|
||
if new_line_no in audit_df.index:
|
||
new_line_no = (float(new_line_no) + float(next_line_no))/2
|
||
|
||
audit_df.loc[new_line_no] = np.nan
|
||
audit_df.loc[new_line_no]['line_removed'] = 'No'
|
||
|
||
audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
|
||
|
||
# add new action line , audit report, and change flag
|
||
df.loc[rev_index-j + 0.25] = np.nan
|
||
df.loc[rev_index-j + 0.25,'ssc'] = 0
|
||
df.loc[rev_index-j + 0.25,'data'] = after_par
|
||
df.loc[rev_index-j + 0.25,'case'] = ''
|
||
df.loc[rev_index-j + 0.25,'plb'] = 'N'
|
||
df.loc[rev_index-j + 0.25,'nlb'] = next_line_flag
|
||
#df.loc[index + 0.25,'Identification_Status'] = 'ps6'
|
||
df.loc[rev_index-j + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
|
||
df.loc[rev_index-j + 0.25,'isIdentified'] = 'Yes'
|
||
df.loc[rev_index-j + 0.25,'parenthetical'] = 'StartingLeft'
|
||
df.loc[rev_index-j + 0.25,'line_no'] = new_line_no
|
||
|
||
df = df.sort_index().reset_index(drop=True)
|
||
|
||
rev_index += 1
|
||
# insert audit report
|
||
|
||
if df['parenthetical'][rev_index-j] == 'StartingLeft' and (rev_index-j)>=start_index:
|
||
if j >=1:
|
||
df['Identification_Status'][rev_index-j] = 'ps4'
|
||
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index-j] = 'Yes'
|
||
cur_data = df['data'][rev_index-j]
|
||
new_data = speaker_name.capitalize() + ' ' + cur_data.replace('(','').strip()
|
||
try:
|
||
print(new_data)
|
||
except:
|
||
pass
|
||
df['data'][rev_index-j] = new_data
|
||
df['parenthetical'][rev_index-j] = 'Absent'
|
||
j -= 1
|
||
|
||
while j != 0:
|
||
df['Identification_Status'][rev_index-j] = 'ps5'
|
||
df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index-j] = 'Yes'
|
||
try:
|
||
print(df['data'][rev_index-j])
|
||
except:
|
||
pass
|
||
j -= 1
|
||
df['Identification_Status'][rev_index] = 'ps6'
|
||
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index] = 'Yes'
|
||
cur_data = df['data'][rev_index]
|
||
new_data = cur_data.replace(')','').strip()
|
||
df['data'][rev_index] = new_data
|
||
df['parenthetical'][rev_index] = 'Absent'
|
||
try:
|
||
print(new_data)
|
||
except:
|
||
pass
|
||
# insert audit report
|
||
|
||
elif last_line_par == 'Absent' and df['case'][rev_index] != 'AllUpper':
|
||
print("Identifying as dialogue end")
|
||
df['Identification_Status'][rev_index] = 'ps15'
|
||
df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][rev_index] = 'Yes'
|
||
dialogue_end_identified = True
|
||
|
||
|
||
### Now the last line or lines till parenthtical start have been examined
|
||
## if dialogue end is not identified then any last dialogue will be end
|
||
|
||
|
||
|
||
if not dialogue_end_identified:
|
||
print("Could not identify the dialogue")
|
||
continue
|
||
else:
|
||
print("dialogue end identfied as")
|
||
data = df['data'][dia_end]
|
||
try:
|
||
print(data)
|
||
except:
|
||
pass
|
||
## not for the remaining lines identify first as dialoguee beginning and others as middle
|
||
j = start_index
|
||
cur_line_par = df['parenthetical'][j]
|
||
data = df['data'][j]
|
||
parenthetical_begun = False
|
||
|
||
if j < dia_end :
|
||
print("\n")
|
||
try:
|
||
print(data)
|
||
except:
|
||
pass
|
||
|
||
if cur_line_par == 'Absent':
|
||
|
||
print("Identifying as dialogue begining")
|
||
|
||
df['Identification_Status'][j] = 'ps13'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par in ('PartMidEnd'):
|
||
print("Identifying as dialogue mixed with parenthetical")
|
||
|
||
df['Identification_Status'][j] = 'ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par == 'MixedLeft':
|
||
print("Identifying as dialogue mixed with parenthtical")
|
||
|
||
df['Identification_Status'][j] = 'ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = True
|
||
|
||
elif cur_line_par == 'StartingLeft':
|
||
print("Identifying as parenthetical beginning")
|
||
|
||
df['Identification_Status'][j] = 'ps11'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = True
|
||
|
||
|
||
elif cur_line_par in ('PartStartMid'):
|
||
print("Identifying as parenthetical mixed with dialog")
|
||
|
||
df['Identification_Status'][j] = 'ps26'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par in ('PartMidMid') :
|
||
print("Identifying as dialogue mixed with parenthtical ")
|
||
|
||
df['Identification_Status'][j] = 'ps26;ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
next_line_blank = True if df['nlb'][j] == 'Y' else False
|
||
|
||
if next_line_blank :
|
||
j += 2
|
||
else:
|
||
j += 1
|
||
|
||
|
||
else:
|
||
continue
|
||
|
||
## now end and beginning have been examined . rest are middle if parentheical absent
|
||
cur_line_par = df['parenthetical'][j]
|
||
data = df['data'][j]
|
||
|
||
|
||
while j < dia_end :
|
||
print("\n")
|
||
try:
|
||
print(data)
|
||
except:
|
||
pass
|
||
|
||
|
||
if cur_line_par == 'Absent':
|
||
|
||
print("Identifying as dialogue middle")
|
||
|
||
df['Identification_Status'][j] = 'ps14'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par in ('PartMidEnd'):
|
||
print("Identifying as dialogue mixed with parenthetical")
|
||
|
||
df['Identification_Status'][j] = 'ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par == 'MixedLeft':
|
||
print("Identifying dialogue mixed with parenthtical")
|
||
|
||
df['Identification_Status'][j] = 'ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = True
|
||
|
||
elif cur_line_par == 'StartingLeft':
|
||
print("Identifying as parenthetical beginning")
|
||
|
||
df['Identification_Status'][j] = 'ps11'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = True
|
||
|
||
elif cur_line_par in ('Absent') and parenthetical_begun:
|
||
print("Identifying as parenthetical middle")
|
||
|
||
df['Identification_Status'][j] = 'ps20'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = True
|
||
|
||
elif cur_line_par in ('EndingRight') and parenthetical_begun:
|
||
print("Identifying as parenthetical ending")
|
||
|
||
df['Identification_Status'][j] = 'ps12'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = False
|
||
|
||
elif cur_line_par in ('MixedRight') and parenthetical_begun:
|
||
print("Identifying as dialogue mixed with parenthetical ")
|
||
|
||
df['Identification_Status'][j] = 'ps26'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
parenthetical_begun = False
|
||
|
||
elif cur_line_par in ('PartStartMid'):
|
||
print("Identifying as parenthetical mixed with dialog")
|
||
|
||
df['Identification_Status'][j] = 'ps26'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
elif cur_line_par in ('PartMidMid') :
|
||
print("Identifying as dialogue mixed with parenthtical ")
|
||
|
||
df['Identification_Status'][j] = 'ps26;ps27'
|
||
df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
|
||
df['isIdentified'][j] = 'Yes'
|
||
|
||
|
||
next_line_blank = True if df['nlb'][j] == 'Y' else False
|
||
|
||
if next_line_blank :
|
||
j += 2
|
||
else:
|
||
j += 1
|
||
|
||
data = df['data'][j]
|
||
print("printing_data before loop")
|
||
cur_line_par = df['parenthetical'][j]
|
||
|
||
for index in df.index:
|
||
if df['Identification_Status'][index] == 'blank' or (len(df['Identification_Status'][index].split(";")) == 1):
|
||
df['isIdentified'][index] = 'Yes'
|
||
else:
|
||
df['isIdentified'][index] = 'No'
|
||
|
||
return df
|
||
|
||
def prep_pnnbl_wts(csv_pnbl_nnbl, cur_dir):
    """Split the combined PNBL/NNBL weight sheet into two CSV files.

    The sheet is read with its first physical line skipped, so the second
    line supplies the header.  Column 0 together with columns 28-39 becomes
    the "previous non-blank line" table and column 0 together with columns
    41-52 becomes the "next non-blank line" table.  Both tables are
    relabelled and written to ``cur_dir`` as ``pnbl_weights.csv`` and
    ``nnbl_weights.csv`` (without the pandas index).

    Args:
        csv_pnbl_nnbl: Path (or file-like object) of the combined sheet.
        cur_dir: Directory that receives the two output files.

    Returns:
        None.  The result is only written to disk.

    Raises:
        IndexError: If the sheet has fewer than 53 columns.
    """
    # Output labels, in the positional order of the selected sheet columns.
    pnbl_labels = ['Possibilities', 'ps2', 'ps1', 'ps3', 'ps4', 'ps5', 'ps6',
                   'ps7', 'ps10', 'ps13', 'ps14', 'ps15', 'ps16']
    nnbl_labels = ['Possibilities', 'ps3', 'ps2', 'ps1', 'ps16', 'ps13',
                   'ps14', 'ps15', 'ps10', 'ps7', 'ps4', 'ps5', 'ps6']

    pnbl_nnbl_df = pd.read_csv(csv_pnbl_nnbl, skiprows=[0])

    # Take explicit copies: relabelling a slice in place is the classic
    # SettingWithCopy pattern and is not guaranteed to behave consistently.
    pnbl_df = pnbl_nnbl_df.iloc[:, [0] + list(range(28, 40))].copy()
    nnbl_df = pnbl_nnbl_df.iloc[:, [0] + list(range(41, 53))].copy()

    pnbl_df.columns = pnbl_labels
    nnbl_df.columns = nnbl_labels

    pnbl_df.to_csv(os.path.join(cur_dir, 'pnbl_weights.csv'), index=False)
    nnbl_df.to_csv(os.path.join(cur_dir, 'nnbl_weights.csv'), index=False)
|
||
|
||
|
||
def identify_using_pnbl_nnbl(df, identify_using, iteration):
    """Narrow each ambiguous line's possibilities using its neighbours.

    For every row whose status holds more than one ``;``-separated
    possibility, the nearest previous and next non-blank rows are inspected.
    When such a neighbour is itself identified (single possibility and
    ``isIdentified`` not ``'No'``), the module-level weight tables
    ``pnbl_df`` / ``nnbl_df`` are consulted for the possibilities whose
    weight in the neighbour's column is positive.  The row's new
    possibilities are the intersection of its current ones with both
    neighbour-derived sets, plus anything listed in ``ps_not_to_remove``,
    ordered by the row's own per-possibility weight columns (descending).

    NOTE(review): ``pnbl_df`` and ``nnbl_df`` are not defined in this
    function; they are assumed to be module-level DataFrames indexed by
    possibility name (``'psN'``) with one column per identified status.
    Confirm against the rest of the module.  The loop also does ``index - j``
    / ``index + j`` arithmetic on labels, so ``df`` is assumed to carry a
    default 0..n-1 index.

    Args:
        df: Line table.  Read columns: ``identify_using``, ``line_no``,
            ``data``, ``isIdentified``, optional ``ps_not_to_remove`` and the
            per-possibility weight columns.  Mutated in place.
        identify_using: Name of the column copied into
            ``Identification_Status`` before processing.
        iteration: Used only to name the per-iteration count column.

    Returns:
        Tuple ``(df, new_lines_identified, identify_using,
        count_lines_identified, line_nos_identified, pos_decreased)``.
        ``identify_using`` is replaced by ``'Identification_Status'`` once at
        least one line has been narrowed to a single possibility.
    """

    def takeNumeric(ps):
        # 'ps13' -> 13, so possibilities sort numerically rather than lexically.
        return int(ps[2:])

    def show_data(row):
        # Best-effort debug output only; it must never abort identification
        # (for example on a console that cannot encode the text).
        try:
            print(df.at[row, 'data'])
        except Exception:
            pass

    count_lines_identified = 0
    new_lines_identified = False
    # Working column: for the first iteration this is the stage-1 output.
    df['Identification_Status'] = df[identify_using]
    pos_count_column_name = 'CountofPossibilities_afterIteration' + str(iteration)
    # Object dtype so that integer counts can be stored next to '' defaults.
    df[pos_count_column_name] = pd.Series([''] * len(df), index=df.index, dtype=object)
    line_nos_identified = []
    total_pos_before = 0
    total_pos_after = 0

    for index in df.index:
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        line_no = df.at[index, 'line_no']

        # Only lines that still carry several possibilities are processed.
        if len(cur_line_pos) <= 1:
            print(line_no, ": line already identified as", df.at[index, 'Identification_Status'])
            total_pos_before += 1
            total_pos_after += 1
            continue

        print(line_no, ": line currently has more than one possibilties")
        show_data(index)
        print(cur_line_pos)
        line_pos_using_pnbl = cur_line_pos
        line_pos_using_nnbl = cur_line_pos
        total_pos_before += len(cur_line_pos)

        # ---- previous non-blank line -------------------------------------
        j = 1
        pnbl_found = False
        print("looking for previous non-blank line")
        while not pnbl_found and (index - j) >= 0:
            if df.at[index - j, 'Identification_Status'] == 'blank':
                print("previous line is blank")
                print("moving to subsequent previous line")
                j += 1
            else:
                pnbl_found = True
                print("found previous non-blank line")

        if pnbl_found:
            pnbl_identified_as = df.at[index - j, 'Identification_Status']
            if df.at[index - j, 'isIdentified'] == 'No':
                print("but as previous non-blank line is unidentified so cannot perform pnbl check, so skipping")
            elif len(pnbl_identified_as.split(";")) > 1:
                print("but as previous non-blank line is unidentified (has more than one possibilties) so cannot perform pnbl check, so skipping")
            else:
                print("AND previous non-blank line is already identified as", pnbl_identified_as)
                show_data(index - j)
                df.loc[index, 'pnbl_identified_as'] = pnbl_identified_as
                try:
                    candidates = sorted(
                        pnbl_df.loc[pnbl_df[pnbl_identified_as] > 0, pnbl_identified_as].index,
                        key=takeNumeric,
                    )
                except Exception:
                    # Deliberately tolerant: a missing column or table simply
                    # means this neighbour cannot restrict the line.
                    print("pnbl weights sheet does not have column", pnbl_identified_as)
                else:
                    line_pos_using_pnbl = candidates
                    df.loc[index, 'pos_using_pnbl'] = ';'.join(str(elem) for elem in candidates)

        # ---- next non-blank line -----------------------------------------
        j = 1
        nnbl_found = False
        print("looking for next non-blank line")
        while not nnbl_found and (index + j) < len(df):
            if df.at[index + j, 'Identification_Status'] == 'blank':
                print("next line is blank")
                print("moving to subsequent next line")
                j += 1
            else:
                nnbl_found = True
                print("found next non-blank line")

        if nnbl_found:
            nnbl_identified_as = df.at[index + j, 'Identification_Status']
            if df.at[index + j, 'isIdentified'] == 'No':
                print("but as next non-blank line is unidentified so cannot perform nnbl check, so skipping")
            elif len(nnbl_identified_as.split(";")) > 1:
                print("but as next non-blank line is unidentified(has multiple possibilties) so cannot perform nnbl check, so skipping")
            else:
                print("AND next non-blank line is already identified as", nnbl_identified_as)
                show_data(index + j)
                df.loc[index, 'nnbl_identified_as'] = nnbl_identified_as
                try:
                    candidates = sorted(
                        nnbl_df.loc[nnbl_df[nnbl_identified_as] > 0, nnbl_identified_as].index,
                        key=takeNumeric,
                    )
                except Exception:
                    print("nnbl weights sheet does not have column", nnbl_identified_as)
                else:
                    line_pos_using_nnbl = candidates
                    df.loc[index, 'pos_using_nnbl'] = ';'.join(str(elem) for elem in candidates)

        # ---- combine -----------------------------------------------------
        # Neighbours may only eliminate possibilities, never add new ones,
        # hence the intersection with the line's current set.
        pos_using_pnbl_nnbl = sorted(
            set(line_pos_using_pnbl) & set(line_pos_using_nnbl) & set(cur_line_pos)
        )

        # Re-insert protected possibilities that the intersection removed.
        try:
            pos_not_to_remove = df.at[index, 'ps_not_to_remove'].split(";")
        except (KeyError, AttributeError):
            # Column absent, or the cell is not a string (e.g. NaN).
            pos_not_to_remove = []
        for ps in pos_not_to_remove:
            if ps not in pos_using_pnbl_nnbl:
                pos_using_pnbl_nnbl.insert(0, ps)

        print("pos_using_pnbl_nnbl is ", pos_using_pnbl_nnbl)
        # A single remaining possibility means the line is identified.
        if len(pos_using_pnbl_nnbl) == 1:
            new_lines_identified = True
            identify_using = 'Identification_Status'
            count_lines_identified += 1
            line_nos_identified.append(df.at[index, 'line_no'])
            df.loc[index, 'When_Identified'] = 'PNBL_NNBL'

        # Pair every possibility with its weight.  Pairs (rather than
        # "ps-wt" strings parsed back with split("-")) keep negative weights
        # from breaking the sort key.
        weighted = []
        for pos in pos_using_pnbl_nnbl:
            try:
                wt = df.at[index, str(pos)]
            except KeyError:
                print("could not find weight for pos ", pos, " at index ", index)
                continue
            try:
                wt = int(wt)
            except (TypeError, ValueError, OverflowError):
                print("could not convert wt to int for pos ", pos, " at index ", index)
                continue
            weighted.append((str(pos), wt))

        print("pos_with_weights is ", ["{0}-{1}".format(p, w) for p, w in weighted])
        # Descending by weight; the sort is stable so ties keep their order.
        weighted.sort(key=lambda pair: pair[1], reverse=True)
        pos_with_weights = ["{0}-{1}".format(p, w) for p, w in weighted]
        print("sorted pos_with_weights is ", pos_with_weights)

        df.loc[index, 'Identification_Status_with_weights'] = ';'.join(pos_with_weights)

        # Same order, without the weights, becomes the new status.
        pos_without_weight = [p for p, _ in weighted]
        line_pos_string = ';'.join(pos_without_weight)
        print("line new possibilites", line_pos_string)
        df.loc[index, 'pos_using_pnbl_nnbl'] = line_pos_string
        df.loc[index, 'Identification_Status'] = line_pos_string
        print(df.at[index, 'Identification_Status'])

        # One count column per iteration.
        df.loc[index, pos_count_column_name] = len(pos_without_weight)
        total_pos_after += len(pos_without_weight)

    print("new lines identified :", new_lines_identified)
    print(total_pos_before, total_pos_after)

    pos_decreased = (total_pos_before - total_pos_after) > 0

    return df, new_lines_identified, identify_using, count_lines_identified, line_nos_identified, pos_decreased
|
||
|
||
|
||
|
||
|
||
|
||
def remove_ineligible_pos(df,identify_using,iteration):
|
||
|
||
|
||
def useWeights(ps):
|
||
return int(ps.split("-")[1])
|
||
|
||
def check_pos_eligibility(pos,pvs_line_pos,nxt_line_pos,first_line_flag,last_line_flag):
|
||
pos_eligible = False
|
||
pos_eligible = ps_conditions_dict.get(pos,pos)
|
||
# if pos == 'ps7':
|
||
# print(pvs_line_pos)
|
||
# print(nxt_line_pos)
|
||
# pos_eligible = 'ps7' if (any(ps in pvs_line_pos for ps in ['blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps10','ps11','ps13']) or last_line_flag) else False
|
||
# else :
|
||
# pos_eligible = pos
|
||
|
||
return pos_eligible
|
||
|
||
count_lines_identified = 0
|
||
pos_count_column_name = 'CountofEligiblePossibilities_afterIteration' + str(iteration)
|
||
df[pos_count_column_name] = ''
|
||
new_lines_identified = False
|
||
total_pos_before = 0
|
||
total_pos = 0
|
||
pos_decreased = False
|
||
line_not_identified = False
|
||
|
||
|
||
for index in df.index:
|
||
#for index in range(5,8):
|
||
|
||
total_pos_before += len(df[identify_using][index].split(";"))
|
||
line_not_identified = True if (len(df[identify_using][index].split(";")) > 1) else False
|
||
print (index,line_not_identified)
|
||
if line_not_identified :
|
||
line_pos = df[identify_using][index].split(";")
|
||
pvs_line_pos = []
|
||
nxt_line_pos = []
|
||
first_line_flag = False
|
||
last_line_flag = False
|
||
|
||
|
||
if index == 0:
|
||
first_line_flag = True
|
||
nxt_line_pos = df[identify_using][index+1].split(";")
|
||
elif index == df.index[-1]:
|
||
pvs_line_pos = df[identify_using][index-1].split(";")
|
||
last_line_flag = True
|
||
else:
|
||
pvs_line_pos = df[identify_using][index-1].split(";")
|
||
nxt_line_pos = df[identify_using][index+1].split(";")
|
||
|
||
line_eligible_pos = []
|
||
|
||
print('\n')
|
||
print (index)
|
||
print(pvs_line_pos)
|
||
print(nxt_line_pos)
|
||
|
||
ps_conditions_dict = {
|
||
'ps1': 'ps1' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16','ps17']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
|
||
'ps2': 'ps2' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16','ps17']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps3']) or last_line_flag) else False,
|
||
'ps3': 'ps3' if (any(ps in pvs_line_pos for ps in ['ps2']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
|
||
'ps4': 'ps4' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps5','ps6']) or last_line_flag) else False,
|
||
'ps5': 'ps5' if (any(ps in pvs_line_pos for ps in ['ps4','ps5']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps5','ps6']) or last_line_flag) else False,
|
||
'ps6': 'ps6' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
|
||
'ps7': 'ps7' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
|
||
'ps8': 'ps8' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps9','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
|
||
'ps9': 'ps9' if (any(ps in pvs_line_pos for ps in ['ps7']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps10','ps11','ps13','ps15']) or last_line_flag) else False,
|
||
'ps10':'ps10' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps13','ps15']) or last_line_flag) else False,
|
||
'ps11':'ps11' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps12','ps26']) or last_line_flag) else False,
|
||
'ps12':'ps12' if (any(ps in pvs_line_pos for ps in ['ps11']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps13']) or last_line_flag) else False,
|
||
'ps13':'ps13' if (any(ps in pvs_line_pos for ps in ['blank','ps7','ps8','ps9','ps10','ps12']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps14','ps15']) or last_line_flag) else False,
|
||
'ps14':'ps14' if (any(ps in pvs_line_pos for ps in ['ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','blank']) or last_line_flag) else False,
|
||
'ps15':'ps15' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
|
||
'ps16':'ps16' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3']) or last_line_flag) else False,
|
||
'ps17':'ps17' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps6','ps15','ps16','ps17','ps18']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps1','ps2','ps8','ps16','ps17','ps18','ps19','blank']) or last_line_flag) else False,
|
||
'ps18':'ps18' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps15','ps16']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps5','ps16']) or last_line_flag) else False,
|
||
'ps19':'ps19' if (any(ps in pvs_line_pos for ps in ['blank','ps15','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps4','ps6']) or last_line_flag) else False,
|
||
'ps21':'ps21' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps10','ps11','ps13']) or last_line_flag) else False,
|
||
'ps22':'ps22' if (any(ps in pvs_line_pos for ps in ['blank','ps1','ps3','ps4','ps5','ps15']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps3']) or last_line_flag) else False,
|
||
'ps25':'ps25' if (any(ps in pvs_line_pos for ps in ['blank','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps12','ps13']) or last_line_flag) else False,
|
||
'ps26':'ps26' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps11','blank','ps27']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','ps27']) or last_line_flag) else False,
|
||
'ps27':'ps27' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','ps26','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps26','ps14','ps15']) or last_line_flag) else False,
|
||
'ps28':'ps28' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14','blank']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps9','ps10','ps11','ps13','ps15']) or last_line_flag) else False,
|
||
'ps29':'ps29' if (any(ps in pvs_line_pos for ps in ['ps7','ps8','ps9','ps10','ps12','ps13','ps14']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2','ps4','ps6','ps7','ps8','ps16']) or last_line_flag) else False,
|
||
'ps30':'ps30' if (any(ps in pvs_line_pos for ps in ['blank','ps6','ps7']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['ps14','ps15','blank']) or last_line_flag) else False,
|
||
'ps31':'ps31' if (any(ps in pvs_line_pos for ps in ['blank','ps15','ps6']) or first_line_flag) and (any(ps in nxt_line_pos for ps in ['blank','ps1','ps2']) or last_line_flag) else False
|
||
}
|
||
print("current possibilities",line_pos)
|
||
for pos in line_pos:
|
||
#print (pos)
|
||
pos_checked = check_pos_eligibility(pos,pvs_line_pos,nxt_line_pos,first_line_flag,last_line_flag)
|
||
if pos_checked:
|
||
line_eligible_pos.append(pos_checked)
|
||
|
||
print("eligible possibilities",line_eligible_pos)
|
||
## append back the possibilties (top and other which are not to be removed)
|
||
## append the top and other posibility back to the beginning list if it got eliminated in the intersection
|
||
|
||
try:
|
||
pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
|
||
except:
|
||
pos_not_to_remove = []
|
||
for ps in pos_not_to_remove:
|
||
if ps not in line_eligible_pos:
|
||
line_eligible_pos.insert(0,ps)
|
||
|
||
|
||
print (";".join(line_pos))
|
||
print (";".join(line_eligible_pos))
|
||
df['Identification_Status_ineligible_removed'][index] = ";".join(line_eligible_pos)
|
||
df['CountofPossibilities_afterIneligibleRemoved'][index] = len(line_eligible_pos)
|
||
|
||
##write the count of possibilities to a column, make new column for each iteration
|
||
df[pos_count_column_name][index] = len(line_eligible_pos)
|
||
total_pos += len(line_eligible_pos)
|
||
if len(line_eligible_pos) == 1:
|
||
count_lines_identified +=1
|
||
new_lines_identified = True
|
||
df['When_Identified'][index] = 'RemovingIneligiblePossibilities'
|
||
|
||
# if len(line_pos) - len(line_eligible_pos) > 0:
|
||
# pos_decreased = True
|
||
|
||
else:
|
||
df['Identification_Status_ineligible_removed'][index] = df[identify_using][index]
|
||
total_pos += 1
|
||
continue
|
||
|
||
|
||
## copy over the inelgible removed to Identification Status and sort in decreasing order of weights
|
||
##df.loc[:,'Identification_Status'] = df.loc[:,'Identification_Status_ineligible_removed']
|
||
pos_eligible = df['Identification_Status_ineligible_removed'][index].split(";")
|
||
## append the weight to the possibilites
|
||
print("test")
|
||
pos_with_weights = []
|
||
for pos in pos_eligible:
|
||
|
||
wt = ''
|
||
pos_wt = str(pos)
|
||
try:
|
||
wt = df[pos_wt][index]
|
||
|
||
except:
|
||
print("could not find weight for pos ",pos," at index ",index)
|
||
continue
|
||
|
||
try:
|
||
wt = int(wt)
|
||
pos_wt += '-' + str(wt)
|
||
except:
|
||
print("could not convert wt to int for pos ",pos," at index ",index)
|
||
continue
|
||
|
||
pos_with_weights.append(pos_wt)
|
||
|
||
# now sort in descending order using the weights as key
|
||
pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
|
||
print(pos_with_weights)
|
||
line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
|
||
|
||
df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
|
||
|
||
## copy over to identification status without the weights but in order of decreasing weights
|
||
pos_without_weight = []
|
||
for pos in pos_with_weights:
|
||
pos_without_weight.append(pos.split("-")[0])
|
||
|
||
line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
|
||
print(line_pos_string)
|
||
|
||
df['Identification_Status_ineligible_removed'][index] = line_pos_string
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
if (total_pos_before - total_pos) > 0:
|
||
pos_decreased = True
|
||
else:
|
||
pos_decreased = False
|
||
|
||
print(total_pos_before,total_pos)
|
||
return df,new_lines_identified,pos_decreased,count_lines_identified,total_pos
|
||
|
||
|
||
def do_while_pnnbl_ineligible(df):
    """Alternate pnbl/nnbl identification and ineligible-possibility removal.

    Each outer pass runs two inner fix-point loops:

    1. ``identify_using_pnbl_nnbl`` is called repeatedly while it reports
       newly identified lines or a decrease in possibilities.
    2. ``remove_ineligible_pos`` is called repeatedly under the same stop
       condition; its result column ``Identification_Status_ineligible_removed``
       is then copied back into ``Identification_Status``.

    The outer pass repeats while step 2 lowered the total number of
    ';'-separated possibilities in ``Identification_Status``.

    The previous version also loaded ``pnbl_weights.csv`` / ``nnbl_weights.csv``
    into locals that were never read (using the deprecated
    ``pd.to_numeric(errors='ignore')``); that dead I/O has been removed.

    Args:
        df: DataFrame with at least an ``Identification_Status`` column of
            ';'-separated possibility strings. Helper columns are created
            (as empty strings) when missing.

    Returns:
        The updated DataFrame.
    """
    total_pos_initial = sum(
        len(status.split(";")) for status in df['Identification_Status']
    )
    total_pos_after = 0
    run_again = True

    while run_again:
        ## run the identification using pnbl_nnbl till no new lines get identified
        new_lines_identified = True
        pos_decreased = False
        iteration = 1
        count_total = 0
        line_nos_identified = []

        if 'Identification_Status' in df.columns:
            identify_using = 'Identification_Status'
        else:
            identify_using = 'stage-1_output'

        # Working columns are created on first use only, so values written
        # by an earlier outer pass are kept.
        for helper_column in (
            'Identification_Status_with_weights',
            'pnbl_identified_as',
            'pos_using_pnbl',
            'nnbl_identified_as',
            'pos_using_nnbl',
            'pos_using_pnbl_nnbl',
        ):
            if helper_column not in df:
                df[helper_column] = ''

        while new_lines_identified or pos_decreased:
            print("Identifying lines using pnbl_nnbl ")
            print("using:",identify_using)
            (
                df,
                new_lines_identified,
                identify_using,
                count,
                line_nos_identified_iteration,
                pos_decreased,
            ) = identify_using_pnbl_nnbl(df, identify_using, iteration)

            print("New lines identified in Iteration",iteration,": ",count)
            iteration += 1
            count_total += count
            line_nos_identified.append(line_nos_identified_iteration)
            print("lines identified in iteration",line_nos_identified)

        print(df['Identification_Status'].value_counts())
        print ("Total new lines identified in pnbl nnbl after all iteration:",count_total)
        print ("line nos identified in all iterations",line_nos_identified)

        ## run the identification by eliminating possibilities till no new lines get identified
        new_lines_identified = True
        pos_decreased = True
        iteration = 1
        count_total = 0
        total_pos_after = 0
        identify_using = 'Identification_Status'

        # NOTE(review): source indentation was lost; both columns are assumed
        # to be created together under this guard - confirm.
        if 'Identification_Status_ineligible_removed' not in df.columns:
            df['Identification_Status_ineligible_removed'] = ''
            df['CountofPossibilities_afterIneligibleRemoved'] = ''

        total_pos_start = sum(
            len(status.split(";")) for status in df['Identification_Status']
        )

        while new_lines_identified or pos_decreased:
            print("\n Identifying lines using eliminating ineligible possibilities ")
            print("using:",identify_using)
            (
                df,
                new_lines_identified,
                pos_decreased,
                count,
                total_pos_after,
            ) = remove_ineligible_pos(df, identify_using, iteration)

            print("New lines identified in Iteration",iteration,": ",count)
            # After the first elimination pass, later passes read its output column.
            identify_using = 'Identification_Status_ineligible_removed'
            iteration += 1
            count_total += count

        print ("Total new lines identified by eliminating ineligible possibilities after all iteration:",count_total)

        ## copy over column
        df['Identification_Status'] = df['Identification_Status_ineligible_removed']
        print(df['Identification_Status'].value_counts())
        print(total_pos_start,total_pos_after,iteration)

        ## run both pnnbl and ineligible removal again only if the
        ## elimination step reduced the number of possibilities
        run_again = total_pos_start > total_pos_after

    print(total_pos_initial,total_pos_after)
    return df
|
||
|
||
def examine_same_content_lines(df):
    """Resolve unidentified lines whose stripped text repeats in the script.

    A line qualifies when it is not yet identified, lists ``ps7`` among its
    possibilities, its stripped text occurs more than once among the
    unidentified lines, and the previous non-blank line is identified as
    exactly ``ps15`` or ``ps6``. A qualifying line without a parenthetical
    becomes ``ps7`` and is marked identified; one with a parenthetical has
    its possibilities narrowed to ``ps8;ps25``.

    Fixes relative to the previous version:
      * ``== ('ps15' or 'ps6')`` only ever compared against ``'ps15'``.
      * ``Identification_Status`` was "assigned" with ``==`` (a no-op) in
        both branches.
      * a helper column was written into a ``.loc`` slice, and cells were
        written through chained indexing; both are unreliable under pandas
        copy-on-write.
      * the frame was rescanned once per repeated string; it is now scanned
        once (the final frame is the same, only debug output is reduced).

    Args:
        df: DataFrame with columns ``data``, ``isIdentified``,
            ``Identification_Status``, ``plb``, ``parenthetical`` and
            ``When_Identified``. The index is used arithmetically
            (``index - 1`` / ``index - 2``), so it is assumed to be a
            0-based integer range.

    Returns:
        The same DataFrame, updated in place.
    """
    unidentified_text = df.loc[df['isIdentified'] == 'No', 'data'].str.strip()
    occurrences = unidentified_text.value_counts()
    repeated_text = occurrences[occurrences > 1].index.tolist()

    for pos_sp in repeated_text:
        print (pos_sp)
    repeated_lookup = set(repeated_text)

    for index in df.index:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        if 'ps7' not in df.at[index, 'Identification_Status'].split(";"):
            continue

        ## preceded by: skip one extra row when the previous line is blank
        pnbl = index - (2 if df.at[index, 'plb'] == 'Y' else 1)
        if pnbl < 0:
            print("start of script \n")
            continue

        if df.at[index, 'data'].strip() not in repeated_lookup:
            continue

        pnbl_status = df.at[pnbl, 'Identification_Status']
        if pnbl_status not in ('ps15', 'ps6'):
            continue

        print(index)
        try:
            print(df.at[pnbl, 'data'])
        except Exception:
            # Best-effort debug output only (the original swallowed any
            # failure while printing the line text).
            pass
        print(pnbl_status)

        if df.at[index, 'parenthetical'] == 'Absent':
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'isIdentified'] = 'Yes'
            df.at[index, 'When_Identified'] = 'ExaminingSameContentLines'
        else:
            # Narrowed but deliberately left unidentified, as in the original.
            df.at[index, 'Identification_Status'] = 'ps8;ps25'

    return df
|
||
|
||
def examine_action_possibilities_part1(df):
    """Narrow unidentified lines to ``ps6`` using their identified neighbours.

    For every unidentified line (the first and last two rows are skipped so
    that the +/-2 neighbour look-ups stay inside the frame), the line is set
    to ``ps6`` when all of the following hold:

      * its indent (``ssc``) lies between 15 and 25 inclusive,
      * the previous non-blank line has a single possibility, ``ps4`` or
        ``ps5``, and the same indent,
      * the top possibility of the next non-blank line is ``ps7``.

    ``isIdentified`` is left untouched, as in the original code.

    Cell writes now use ``df.at`` instead of chained indexing
    (``df[col][i] = v``), which is unreliable under pandas copy-on-write.

    Args:
        df: DataFrame with columns ``isIdentified``, ``ssc``, ``nlb``,
            ``plb``, ``Identification_Status``, ``data`` and
            ``When_Identified``; the index is assumed to be a 0-based
            integer range (neighbours are addressed as ``index +/- 1|2``).

    Returns:
        The same DataFrame, updated in place.
    """
    min_indent, max_indent = 15, 25

    for index in df.index[2:-2]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue

        print("unidentified line index is",index)
        cur_line_indent = df.at[index, 'ssc']

        # Neighbouring non-blank lines: one row away, or two when the
        # adjacent row is flagged blank.
        next_nbl = index + (1 if df.at[index, 'nlb'] == 'N' else 2)
        prev_nbl = index - (1 if df.at[index, 'plb'] == 'N' else 2)

        next_nbl_line_pos = df.at[next_nbl, 'Identification_Status'].split(";")
        prev_nbl_line_pos = df.at[prev_nbl, 'Identification_Status'].split(";")
        prev_nbl_line_indent = df.at[prev_nbl, 'ssc']

        # NOTE(review): the source indentation was lost. The ps6 check is
        # assumed to sit inside this indent-range guard (where ``data`` is
        # defined) - confirm against the original file.
        if not (min_indent <= cur_line_indent <= max_indent):
            continue

        data = df.at[index, 'data']

        ## check for ps6
        if (
            len(prev_nbl_line_pos) == 1
            and cur_line_indent == prev_nbl_line_indent
            and next_nbl_line_pos[0] == 'ps7'
            and prev_nbl_line_pos[0] in ('ps4', 'ps5')
        ):
            cur_line_new_pos = 'ps6'
            df.at[index, 'Identification_Status'] = cur_line_new_pos
            df.at[index, 'When_Identified'] = 'ExaminingActionPossibilities'
            try:
                print(data)
            except Exception:
                # Best-effort debug output only.
                pass
            print(cur_line_new_pos)
            print("\n")

    return df
|
||
|
||
def examine_action_possibilities_part2(df):
    """Identify ``ps6`` and ``ps5`` lines from fully identified neighbours.

    Two passes are made over the rows, skipping the first and last two so
    the +/-2 neighbour look-ups stay inside the frame:

    1. An unidentified line whose top possibility is not ``ps1`` becomes
       ``ps6`` when the next non-blank line is identified as ``ps7`` (the
       code's own log message calls this "before speaker") and the previous
       non-blank line is identified as ``ps1``, ``ps3``, ``ps6`` or ``ps15``.
    2. A line that still has several possibilities becomes ``ps5`` when the
       previous non-blank line is identified as ``ps4``/``ps5`` and the next
       one as ``ps5``/``ps6``.

    Fixes relative to the previous version:
      * pass 2 tested ``pnbl_pos in ('ps4','ps5')`` where ``pnbl_pos`` is a
        list, so the rule could never fire. It now tests the single
        identified possibility.
        NOTE(review): this enables a rule that was previously dead code;
        re-validate pipeline output on known scripts.
      * cell writes use ``df.at`` instead of chained indexing.

    Args:
        df: DataFrame with columns ``isIdentified``,
            ``Identification_Status``, ``plb``, ``nlb``, ``line_no``,
            ``data`` and ``When_Identified``; the index is assumed to be a
            0-based integer range.

    Returns:
        The same DataFrame, updated in place.
    """
    status_col = 'Identification_Status'

    def neighbour_possibilities(index):
        # Previous / next non-blank line: one row away, or two when the
        # adjacent row is flagged blank.
        prev_nbl = index - (1 if df.at[index, 'plb'] == 'N' else 2)
        next_nbl = index + (1 if df.at[index, 'nlb'] == 'N' else 2)
        return (
            df.at[prev_nbl, status_col].split(";"),
            df.at[next_nbl, status_col].split(";"),
        )

    # loop through the lines and check lines possibility for being action
    for index in df.index[2:-2]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        cur_line_pos = df.at[index, status_col].split(";")
        if cur_line_pos[0] == 'ps1':
            continue

        pnbl_pos, nnbl_pos = neighbour_possibilities(index)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        ## declare ps6 if nnbl is ps7 and pnbl is identified as ps1, ps3, ps6 or ps15
        if (
            len(pnbl_pos) == 1
            and len(nnbl_pos) == 1
            and nnbl_pos[0] == 'ps7'
            and pnbl_pos[0] in ('ps1', 'ps3', 'ps6', 'ps15')
        ):
            print("Identifying line as ps6 as before speaker and after 1,3,6, 15",)
            try:
                print(line_no , data)
            except Exception:
                # Best-effort debug output only.
                pass
            df.at[index, status_col] = 'ps6'
            df.at[index, 'When_Identified'] = 'ExaminingActionPossibilitiesAfterIneligible'
            df.at[index, 'isIdentified'] = 'Yes'

    # loop through to examine for ps5
    for index in df.index[2:-2]:
        status = df.at[index, status_col]
        if status == 'blank':
            continue
        if len(status.split(";")) == 1:
            continue

        pnbl_pos, nnbl_pos = neighbour_possibilities(index)

        ## declare ps5 if prev is ps4/ps5 and next is ps5/ps6 (both identified)
        if (
            len(pnbl_pos) == 1
            and pnbl_pos[0] in ('ps4', 'ps5')
            and len(nnbl_pos) == 1
            and nnbl_pos[0] in ('ps5', 'ps6')
        ):
            print("Identifying line as ps5 as between 4,5 and 5,6")
            df.at[index, status_col] = 'ps5'
            df.at[index, 'When_Identified'] = 'ExaminingActionPossibilitiesAfterIneligible'
            df.at[index, 'isIdentified'] = 'Yes'

    return df
|
||
|
||
def examine_same_indent_bunch(df):
    """Resolve runs ("bunches") of consecutive unresolved lines sharing an indent.

    Starting at each line that still has several possibilities, the following
    non-blank lines are collected into a bunch while they have the same
    ``ssc`` indent. When the non-blank line before the bunch is identified as
    ``ps15`` and the one after it as ``ps7``:

      * CASE A2 - a single-line bunch is identified as ``ps6`` (skipped when
        its top possibility is ``ps1``);
      * CASE A1 - in a longer bunch whose last line is unresolved and does
        not list ``ps15``, the last line is identified as ``ps6`` and every
        other bunch line keeps only the possibilities listed in
        ``ps_not_to_remove`` (described in the original comments as action,
        slugline and transition).

    Lines consumed by a bunch are skipped by the outer iteration.

    Cell writes now use ``df.at`` instead of chained indexing
    (``df[col][i] = v``), which is unreliable under pandas copy-on-write.
    The unused ``lines_count`` local and the commented-out legacy cases were
    removed.

    Args:
        df: DataFrame with columns ``Identification_Status``, ``ssc``,
            ``nlb``, ``plb``, ``data``, ``isIdentified`` and
            ``When_Identified``; the index is assumed to be a 0-based
            integer range.

    Returns:
        The same DataFrame, updated in place.
    """
    status_col = 'Identification_Status'
    ps_not_to_remove = ['ps1','ps2','ps3','ps4','ps5','ps6','ps16','ps18','ps19']

    def debug_print(value):
        # Best-effort debug output only (the original swallowed any failure
        # while printing line text).
        try:
            print(value)
        except Exception:
            pass

    total_pos_before = sum(len(s.split(";")) for s in df[status_col])
    last_index = df.index[-1] if len(df.index) else None

    index_iter = iter(df.index)
    for index in index_iter:
        line_pos = df.at[index, status_col].split(";")
        if len(line_pos) == 1:
            continue
        print(index)
        cur_indent = df.at[index, 'ssc']

        # Too close to the end of the frame to look two rows ahead.
        if index + 2 > last_index:
            break

        # First candidate: the next non-blank line.
        j = index + (2 if df.at[index, 'nlb'] == 'Y' else 1)
        next_nbl_indent = df.at[j, 'ssc']
        # NOTE(review): this flag is evaluated for the first following line
        # only and never refreshed inside the loop below, so already
        # identified lines further down can still join the bunch. Behaviour
        # kept as in the original - confirm whether that is intended.
        nbl_identified = len(df.at[j, status_col].split(";")) == 1

        start_index = index
        rev_index = index
        nbl_lines_count = 1

        print("lines with same indent")
        debug_print(df.at[index, 'data'])

        bunch_index = [start_index]

        ## get the number of lines with the same indent
        while cur_indent == next_nbl_indent and not nbl_identified:
            nbl_lines_count += 1
            debug_print(df.at[j, 'data'])
            bunch_index.append(j)
            rev_index = j

            if j + 2 >= last_index:
                break

            j += 2 if df.at[j, 'nlb'] == 'Y' else 1
            next_nbl_indent = df.at[j, 'ssc']

        print(nbl_lines_count)

        ## preceded by
        pnbl = start_index - (2 if df.at[index, 'plb'] == 'Y' else 1)
        if pnbl < 0:
            print("start of script \n")
            continue

        print("preceeded by",df.at[pnbl, status_col])
        debug_print(df.at[pnbl, 'data'])

        ## followed by
        nnbl = rev_index + (2 if df.at[rev_index, 'nlb'] == 'Y' else 1)
        if nnbl > last_index:
            print("end of script \n")
            continue

        debug_print(df.at[nnbl, 'data'])
        print("followed by",df.at[nnbl, status_col])
        print("\n")

        if df.at[pnbl, status_col] == 'ps15' and df.at[nnbl, status_col] == 'ps7':
            last_line_pos = df.at[rev_index, status_col].split(";")

            if nbl_lines_count == 1 and len(last_line_pos) > 1:
                if line_pos[0] == 'ps1':
                    continue
                print("CASE A2")
                # single line is ps6
                df.at[rev_index, status_col] = 'ps6'
                df.at[rev_index, 'isIdentified'] = 'Yes'
                df.at[rev_index, 'When_Identified'] = 'ExaminingSameIndentBunch'
                bunch_index.remove(rev_index)
                print("ps6", df.at[rev_index, 'data'])

            elif nbl_lines_count > 1 and len(last_line_pos) > 1 and 'ps15' not in last_line_pos:
                print("CASE A1")
                # last line is ps6
                df.at[rev_index, status_col] = 'ps6'
                df.at[rev_index, 'isIdentified'] = 'Yes'
                df.at[rev_index, 'When_Identified'] = 'ExaminingSameIndentBunch'
                bunch_index.remove(rev_index)
                print("ps6", df.at[rev_index, 'data'])

                # rest of lines: remove possibilities other than slugline,
                # transition and action.
                # NOTE(review): source indentation was lost; this filter is
                # assumed to belong to CASE A1 only - confirm.
                for k in bunch_index:
                    cur_line_pos = df.at[k, status_col].split(";")
                    print(cur_line_pos)
                    new_line_pos = [pos for pos in cur_line_pos if pos in ps_not_to_remove]
                    df.at[k, status_col] = ";".join(new_line_pos)
                    df.at[k, 'When_Identified'] = 'ExaminingSameIndentBunch'
                    print(df.at[k, status_col], df.at[k, 'data'])

        # Skip the rows already examined as part of this bunch.
        # NOTE(review): assumed to run for every bunch (loop level), not only
        # for the ps15/ps7 case - confirm.
        for _ in range(start_index, rev_index):
            next(index_iter, None)

    total_pos_after = sum(len(s.split(";")) for s in df[status_col])
    print(total_pos_before,total_pos_after)

    return df
|
||
|
||
def examine_relative_indent(df):
    """Identify ``ps7`` lines from a strictly decreasing indent pattern.

    An unresolved line is identified as ``ps7`` when

      * its indent (``ssc``) is greater than that of the next non-blank
        line, which in turn is greater than that of the non-blank line
        after it,
      * none of those three lines has a parenthetical,
      * ``ps7`` is one of its possibilities but not one of the next
        non-blank line's possibilities.

    Lines with no previous non-blank line, or fewer than two following
    non-blank lines inside the frame, are skipped.

    Cell writes now use ``df.at`` instead of chained indexing
    (``df[col][i] = v``), which is unreliable under pandas copy-on-write.
    Unused reads of the previous line and a large commented-out earlier
    version of this routine were removed.

    Args:
        df: DataFrame with columns ``Identification_Status``, ``data``,
            ``ssc``, ``parenthetical``, ``plb``, ``nlb``, ``isIdentified``
            and ``When_Identified``; the index is assumed to be a 0-based
            integer range.

    Returns:
        The same DataFrame, updated in place.
    """
    status_col = 'Identification_Status'

    total_pos_before = sum(len(s.split(";")) for s in df[status_col])
    last_index = df.index[-1] if len(df.index) else None

    for index in df.index:
        line_pos = df.at[index, status_col].split(";")
        if len(line_pos) == 1:
            continue

        data = df.at[index, 'data']
        cur_indent = df.at[index, 'ssc']
        cur_parenthetical_absent = df.at[index, 'parenthetical'] == 'Absent'

        ## preceded by: a previous non-blank line must exist
        pnbl = index - (2 if df.at[index, 'plb'] == 'Y' else 1)
        if pnbl < 0:
            continue

        ## followed by
        if df.at[index, 'nlb'] == 'Y':
            nnbl = index + 2
            if nnbl > last_index:
                print("end of script \n")
                continue
        else:
            nnbl = index + 1
            if nnbl > last_index:
                continue

        nnbl_indent = df.at[nnbl, 'ssc']
        nnbl_parenthetical_absent = df.at[nnbl, 'parenthetical'] == 'Absent'
        nnbl_line_pos = df.at[nnbl, status_col].split(";")

        ## followed followed by
        if df.at[nnbl, 'nlb'] == 'Y':
            nnnbl = nnbl + 2
            if nnnbl > last_index:
                print("end of script \n")
                continue
        else:
            nnnbl = nnbl + 1
            if nnnbl > last_index:
                continue

        nnnbl_indent = df.at[nnnbl, 'ssc']
        nnnbl_parenthetical_absent = df.at[nnnbl, 'parenthetical'] == 'Absent'

        try:
            if (
                cur_indent > nnbl_indent
                and nnbl_indent > nnnbl_indent
                and cur_parenthetical_absent
                and nnbl_parenthetical_absent
                and nnnbl_parenthetical_absent
            ):
                try:
                    print(data)
                except Exception:
                    # Best-effort debug output only.
                    pass
                print("current possibility",line_pos)
                if 'ps7' in line_pos and 'ps7' not in nnbl_line_pos:
                    print("Identifying as ps7")
                    df.at[index, status_col] = 'ps7'
                    df.at[index, 'When_Identified'] = 'ExaminingRelativeIndent'
                    df.at[index, 'isIdentified'] = 'Yes'
                    print("\n")
        except Exception:
            # The original swallowed every error raised by this check and
            # moved on to the next line; kept as best-effort.
            # NOTE(review): presumably guards against non-comparable indent
            # values - confirm and narrow the exception type.
            pass

    total_pos_after = sum(len(s.split(";")) for s in df[status_col])
    print(total_pos_before,total_pos_after)

    return df
|
||
|
||
def examine_pos_sp_indent(df,csv_removed_space_between_words,csv_pnnbl_ineligble_after_relative_indent):
    """Identify short unresolved lines at the usual ``ps7`` indent as ``ps7``.

    The indent (``ssc``) values of lines already identified as exactly
    ``ps7`` are taken from the two CSV snapshots; of the five most frequent
    values the largest is used as the reference indent. Every unidentified
    line in ``df`` whose indent is within 3 columns of that reference, that
    lists ``ps7`` in its status, has no parenthetical and contains at most
    two words is identified as ``ps7`` (logged by the code as "speaker").

    When no line is identified as ``ps7`` the reference falls back to 200,
    as before.
    NOTE(review): 200 presumably acts as an "unreachable indent" sentinel so
    nothing matches - confirm.

    Changes relative to the previous version: notebook-leftover expression
    statements whose results were discarded are removed, the bare ``except``
    used as an empty-list check is replaced by ``max(..., default=200)``,
    and cell writes use ``df.at`` instead of chained indexing.

    Args:
        df: DataFrame with columns ``isIdentified``, ``ssc``, ``data``,
            ``Identification_Status``, ``parenthetical`` and
            ``When_Identified``.
        csv_removed_space_between_words: path of a CSV providing ``line_no``
            and ``ssc`` columns.
        csv_pnnbl_ineligble_after_relative_indent: path of a CSV providing
            ``line_no``, ``Identification_Status`` and ``isIdentified``.

    Returns:
        The same DataFrame, updated in place.
    """
    df_indents = pd.read_csv(csv_removed_space_between_words, usecols=['line_no', 'ssc'])
    identification_status = pd.read_csv(
        csv_pnnbl_ineligble_after_relative_indent,
        usecols=['line_no', 'Identification_Status', 'isIdentified'],
    )
    identification_status['line_no'] = identification_status['line_no'].astype(float)
    df_indents = df_indents.merge(identification_status, how='inner', on='line_no')

    # Five most frequent indents among lines identified as ps7.
    sp_indent_counts = (
        df_indents.loc[df_indents['Identification_Status'] == 'ps7', 'ssc']
        .value_counts()
        .sort_values(ascending=False)
        .head(5)
    )
    pos_sp_indent = max(sp_indent_counts.index.values.tolist(), default=200)

    margin = 3
    for index in df.index:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue

        cur_indent = df.at[index, 'ssc']
        if not (pos_sp_indent - margin <= cur_indent <= pos_sp_indent + margin):
            continue

        data = df.at[index, 'data']
        word_count = len(data.split())

        if (
            'ps7' in df.at[index, 'Identification_Status']
            and df.at[index, 'parenthetical'] == 'Absent'
            and word_count <= 2
        ):
            try:
                print(index,data)
            except Exception:
                # Best-effort debug output only.
                pass
            print("Identifying as speaker")
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'isIdentified'] = 'Yes'
            df.at[index, 'When_Identified'] = 'ExaminingPossibleSpeakerIndent'

    return df
|
||
|
||
|
||
def examine_action_middle_possibilities_using_pnnbl_top(df):
    """Disabled pipeline stage: return ``df`` unchanged.

    The only rule this stage contained was commented out in the source. It
    would have declared a line ``ps5`` when its own top possibility was
    ``ps5``, the top possibility of the previous non-blank line was ``ps4``
    or ``ps5`` and that of the next one ``ps5`` or ``ps6``, unless the line
    had a parenthetical or a nearby preceding line ranked ``ps16`` first or
    second.

    What remained was a loop that looked up neighbouring possibilities into
    locals that were never used (with a bare ``except: pass`` around one
    look-up) and modified nothing. That dead work has been removed; the
    function is kept so existing callers keep working.

    Args:
        df: the line-classification DataFrame.

    Returns:
        The same DataFrame object, unmodified.
    """
    return df
|
||
|
||
|
||
def _try_print(*values):
    """Print ``values`` best-effort; a failing debug print must not stop the run."""
    try:
        print(*values)
    except Exception:
        pass


def _split_speaker_row(df, audit_df, index, speaker_text, tail_text,
                       tail_status, tail_parenthetical, audit_note):
    """Make row ``index`` a speaker line and insert the remainder right after it.

    The row at ``index`` gets ``speaker_text`` as data and is marked as an
    identified speaker (``'ps7'``).  A new row carrying ``tail_text`` is added
    at ``index + 0.25`` and the frame is re-sorted and re-numbered, so the new
    row ends up at ``index + 1``.  The new row's ``line_no`` is the midpoint
    between this line and the next one (moved once more towards the next line
    if that number is already present in ``audit_df``).

    ``audit_df`` is updated in place: a row is added for the new line number
    with ``line_removed = 'No'`` and ``audit_note`` is stored for the original
    line.  All writes use ``.loc[row, column]`` because chained assignment
    (``frame.loc[row][column] = value``) may operate on a temporary copy.

    :param tail_parenthetical: value for the new row's ``parenthetical``
        column, or ``None`` to leave it as the empty string.
    :return: the re-indexed DataFrame (a new object).
    """
    next_line_blank = df.loc[index, 'nlb']

    df.loc[index, 'data'] = speaker_text
    df.loc[index, 'parenthetical'] = 'Absent'
    df.loc[index, 'When_Identified'] = 'ExaminingSpeakerLines'
    df.loc[index, 'case'] = 'AllUpper'
    df.loc[index, 'Identification_Status'] = 'ps7'
    # the remainder now follows directly, so the next line is no longer blank
    df.loc[index, 'nlb'] = 'N'

    line_no = df.loc[index, 'line_no']
    next_line_no = df.loc[index + 1, 'line_no']
    new_line_no = (line_no + next_line_no) / 2
    if new_line_no in audit_df.index:
        new_line_no = (new_line_no + next_line_no) / 2

    audit_df.loc[new_line_no] = ''
    audit_df.loc[new_line_no, 'line_removed'] = 'No'
    audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = audit_note

    new_row = index + 0.25
    df.loc[new_row] = ''
    df.loc[new_row, 'data'] = tail_text
    if tail_parenthetical is not None:
        df.loc[new_row, 'parenthetical'] = tail_parenthetical
    df.loc[new_row, 'When_Identified'] = 'ExaminingSpeakerLines'
    df.loc[new_row, 'Identification_Status'] = tail_status
    df.loc[new_row, 'case'] = ''
    df.loc[new_row, 'plb'] = 'N'
    df.loc[new_row, 'nlb'] = next_line_blank
    df.loc[new_row, 'line_no'] = new_line_no

    return df.sort_index().reset_index(drop=True)


def examine_speaker_extension(df,audit_df):
    """Split lines that hold a speaker together with a parenthetical or dialogue.

    Pass 1: an unidentified line whose top possibility is ``'ps8'``, whose
    ``parenthetical`` is ``'PartMidEnd'``, whose next line has no
    parenthetical and which contains none of the known speaker extensions is
    cut at its first ``(``: the part before becomes a speaker line
    (``'ps7'``), the rest becomes a new parenthetical line (``'ps10'``).

    Pass 2: every distinct speaker text (data of rows whose status is exactly
    ``'ps7'``) is searched for in the remaining unidentified lines.  When the
    name is preceded only by whitespace (at least one character) and followed
    by ``(``, ``:`` or an opening quote, the line is split the same way.

    Speaker names are matched as literal text (``re.escape``); the original
    used them as raw regular expressions, which mis-matched names containing
    ``.`` or parentheses and raised ``re.error`` on unbalanced ones.

    NOTE(review): both loops iterate over the index as it was before any row
    was inserted, so after a split later positions refer to rows shifted by
    one and the tail of the frame is not visited -- behaviour kept as found.

    :param df: line-wise script DataFrame (modified, then replaced on split).
    :param audit_df: audit DataFrame indexed by line number; updated in place.
    :return: the resulting DataFrame; callers must use the return value.
    """
    extn_list = ['O.S.', 'V.O.', "CONT'D", "CONT’D", 'VOICE']
    speaker_skip_list = ['MONTAGES', 'MUSICAL MONTAGES', 'MORNING', 'AT HOTEL',
                         'TV', 'ESSENTIALS', 'ESSENTIAL', 'LATER']

    # ---- pass 1: speaker-extension lines ('ps8') hiding a parenthetical ----
    for index in df.index[2:-2]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        step = 1 if df.loc[index, 'nlb'] == 'N' else 2
        nnbl_par = df.loc[index + step, 'parenthetical']

        data = df.loc[index, 'data']
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        cur_line_par = df.loc[index, 'parenthetical']
        extn_found = any(extn in str(data) for extn in extn_list)

        if not (cur_line_pos[0] == 'ps8' and cur_line_par == 'PartMidEnd'
                and nnbl_par == 'Absent' and not extn_found):
            continue

        _try_print(data)
        paren_at = data.find('(')
        if paren_at == -1:
            continue

        before_par = data[:paren_at]
        after_par = data[paren_at:]

        print ("Separating Parenthetical")
        print("Identifying as speaker")
        print(index)
        _try_print(before_par)
        df = _split_speaker_row(
            df, audit_df, index,
            speaker_text=before_par,
            tail_text=after_par,
            tail_status='ps10',
            tail_parenthetical='Complete',
            audit_note='Separated Speaker and Parenthetical',
        )
        print(df.loc[index, 'Identification_Status'])
        _try_print(after_par)
        print("identifying parenthetical")

    # ---- pass 2: known speakers followed by '(', ':' or a quote ----
    speaker_texts = df.loc[df['Identification_Status'] == 'ps7', 'data'].astype(str)
    # distinct names in first-seen order
    unique_speaker_list = list(dict.fromkeys(name.strip() for name in speaker_texts))
    print(unique_speaker_list)

    # (name, case-sensitive gate, case-insensitive locator) -- the original
    # gated on a case-sensitive search but located with IGNORECASE; both kept.
    speaker_patterns = [
        (name,
         re.compile(re.escape(name)),
         re.compile(re.escape(name), re.IGNORECASE))
        for name in unique_speaker_list
    ]

    for index in df.index[2:-2]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        data = df.loc[index, 'data']
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        extn_found = any(extn in str(data) for extn in extn_list)

        for speaker, gate, locator in speaker_patterns:
            if not gate.search(data):
                continue
            # re-read: an earlier speaker may already have turned this row into 'ps7'
            if df.loc[index, 'Identification_Status'] in ('ps7', 'ps8', 'ps9'):
                continue

            print(index)
            found = locator.search(data)
            before_speaker = data[:found.start()]
            after_speaker = data[found.end():]
            print("speaker match found")
            _try_print("data 4567:", data)
            _try_print("speaker 4568:", speaker)
            _try_print("before speaker:", before_speaker)
            _try_print("after speaker:", after_speaker)

            # first non-blank character after the name ('' when nothing follows)
            char1_after_speaker = after_speaker.lstrip()[:1]
            _try_print("char1_after_speaker 4579 :", char1_after_speaker)

            # every split requires whitespace-only text before the name
            # (''.isspace() is False) and no speaker extension on the line
            if not before_speaker.isspace() or extn_found:
                continue

            if char1_after_speaker == '(' and df.loc[index, 'parenthetical'] == 'PartMidEnd':
                print("before speaker inside the if condition:",before_speaker)
                print ("Seperating Parenthetical")
                print("Identifying speaker")
                print(index)
                df = _split_speaker_row(
                    df, audit_df, index,
                    speaker_text=before_speaker + speaker,
                    tail_text=after_speaker,
                    tail_status='ps10',
                    tail_parenthetical='Complete',
                    audit_note='Separated Speaker and Parenthetical',
                )
                print("identifying parenthetical")

            elif char1_after_speaker == ':' and speaker not in speaker_skip_list:
                print("before speaker in elif condition 4624:", before_speaker)
                print ("Seperating : colon dialogue")
                print("Identifying speaker")
                print(index)
                # the remainder keeps the line's original possibilities
                df = _split_speaker_row(
                    df, audit_df, index,
                    speaker_text=before_speaker + speaker,
                    tail_text=after_speaker,
                    tail_status=";".join(cur_line_pos),
                    tail_parenthetical=None,
                    audit_note='Separated Speaker and Dialogue seperated by colon:',
                )
                print("possible dialogue")
                print(after_speaker)

            elif char1_after_speaker in ('‘', '"'):
                print("before speaker in seperating apostrophe:", before_speaker)
                print ("Seperating apostrophe")
                print("Identifying speaker")
                print(index)
                # NOTE(review): the audit text below is the colon message
                # although this is the quote case -- left as found.
                df = _split_speaker_row(
                    df, audit_df, index,
                    speaker_text=before_speaker + speaker,
                    tail_text='(' + after_speaker.strip() + ')',
                    tail_status='ps10',
                    tail_parenthetical='Complete',
                    audit_note='Separated Speaker and Dialogue seperated by colon:',
                )
                print("identifying as parenthetical")

    return df
|
||
|
||
|
||
|
||
def examine_action_using_top2_part1(df):
    """Settle ``ps4`` (action beginning) candidates using the neighbouring lines.

    Only unidentified rows (``isIdentified != 'Yes'``) between the first two
    and last two rows are examined.  The previous / next neighbour is one row
    away when ``plb`` / ``nlb`` is ``'N'`` and two rows away otherwise.  A row
    is skipped when the next neighbour is exactly ``ps13`` or ``ps15``
    (dialogue) or when the previous neighbour's top possibility is ``ps8``.

    A row whose top possibility is ``ps4`` (and whose previous top is not
    ``ps4``) is set to ``'ps4'`` with ``When_Identified = 'UsingTop2PNNBL'`` if

    * case 1: the next top is not ``ps7``, the previous top contains ``ps6``,
      the next line is adjacent and ``ps7`` is not among the row's own
      possibilities; or
    * case 2: neither the top nor the second choice of the previous line is
      ``ps5``, the row and the next neighbour share the same ``ssc`` indent,
      and ``ps5`` / ``ps6`` is the top or second choice of the next line.

    Writes use ``df.loc[row, column]`` instead of chained assignment so they
    reach ``df`` regardless of pandas' copy semantics.

    NOTE(review): the source indentation was lost; the trailing blank-line
    debug print is assumed to belong to the end of the case-2 branch.

    :param df: line-wise script DataFrame; modified in place.
    :return: the same ``df`` object.
    """
    for index in df.index[2:-2]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        cur_indent = df.loc[index, 'ssc']
        next_is_adjacent = df.loc[index, 'nlb'] == 'N'

        pnbl_index = index - 1 if df.loc[index, 'plb'] == 'N' else index - 2
        nnbl_index = index + 1 if next_is_adjacent else index + 2
        pnbl_pos = df.loc[pnbl_index, 'Identification_Status'].split(";")
        nnbl_pos = df.loc[nnbl_index, 'Identification_Status'].split(";")
        nnbl_indent = df.loc[nnbl_index, 'ssc']

        data = df.loc[index, 'data']
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")

        ## skip if next is dialogue
        if "".join(nnbl_pos) in ('ps13', 'ps15'):
            continue

        # second-ranked possibility of each neighbour ('' when there is none)
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''
        nnbl_top2 = nnbl_pos[1] if len(nnbl_pos) > 1 else ''

        ## ps4 identification made stricter if pnbl top is ps8
        if pnbl_pos[0] == 'ps8':
            continue

        # both cases below need a ps4 top that does not follow another ps4
        if cur_line_pos[0] != 'ps4' or pnbl_pos[0] == 'ps4':
            continue

        ## case 1: action beginning right after an action end
        line_identified = False
        if nnbl_pos[0] != 'ps7':
            try:
                print(data)
            except Exception:
                pass
            print(pnbl_pos[0],cur_line_pos[0],nnbl_pos[0])
            if 'ps6' in pnbl_pos[0] and next_is_adjacent and 'ps7' not in cur_line_pos:
                print('identifying as ps4 case 1 top 1')
                df.loc[index, 'Identification_Status'] = 'ps4'
                df.loc[index, 'When_Identified'] = 'UsingTop2PNNBL'
                line_identified = True
            else:
                print("ps6 not in previous")

        if line_identified:
            continue

        ## case 2: the next line looks like action middle / end
        print("checking for ps5/6 in next")
        if pnbl_pos[0] == 'ps5':
            print("skipping as previous top is ps5" )
            continue
        if pnbl_top2 and 'ps5' in pnbl_top2:
            print("skipping as previous top2 is ps5" )
            continue

        if cur_indent == nnbl_indent:
            if 'ps5' in nnbl_pos[0] or 'ps6' in nnbl_pos[0]:
                reason = 'identifying as ps4 case 2 top1'
            elif nnbl_top2 and 'ps5' in nnbl_top2:
                reason = 'identifying as ps4 case 2 top2 ps5'
            elif nnbl_top2 and 'ps6' in nnbl_top2:
                reason = 'identifying as ps4 case 2 top2 ps6'
            else:
                reason = None

            if reason is not None:
                try:
                    print(data,reason)
                except Exception:
                    pass
                df.loc[index, 'Identification_Status'] = 'ps4'
                df.loc[index, 'When_Identified'] = 'UsingTop2PNNBL'
        else:
            print("current indent is not equal to next indent")

        print("\n")

    # A second pass that settled 'ps6' (action end) from a ps4/ps5 top or
    # second choice of the previous line used to follow here; the source
    # notes it was disabled "as ps6 getting wrong".

    return df
|
||
|
||
|
||
def refine_action_possibilties(df):
    """Drop possibilities that an already identified neighbour rules out.

    For each unidentified row (first and last row excluded) the previous and
    next neighbours are looked up -- one row away when ``plb`` / ``nlb`` is
    ``'N'``, two rows away otherwise, ``['blank']`` when that would run past
    either end of the frame.  Then:

    * next neighbour is exactly ``ps7``  -> remove ``ps5`` and ``ps14``;
    * next neighbour is exactly ``ps4``  -> remove ``ps5``;
    * previous neighbour is exactly ``ps4`` -> remove ``ps3`` and ``ps7``.

    The remaining possibilities are written back to ``Identification_Status``
    with ``df.loc[row, column]`` (chained assignment does not reliably reach
    the frame).  The misspelled ``pnpl_pos`` assignment of the original now
    sets ``pnbl_pos`` as intended.

    :param df: line-wise script DataFrame; modified in place.
    :return: the same ``df`` object.
    """
    for index in df.index[1:-1]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        if index == 0:
            pnbl_pos = ['blank']
        elif df.loc[index, 'plb'] == 'N':
            pnbl_pos = df.loc[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.loc[index - 2, 'Identification_Status'].split(";")

        if index == df.index[-1]:
            nnbl_pos = ['blank']
        elif df.loc[index, 'nlb'] == 'N':
            nnbl_pos = df.loc[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == df.index[-1]:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.loc[index + 2, 'Identification_Status'].split(";")

        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data']
        line_new_pos = df.loc[index, 'Identification_Status'].split(";")

        # a neighbour only counts when it has a single possibility left
        prev_identified = "".join(pnbl_pos)
        next_identified = "".join(nnbl_pos)

        # (applies?, debug message, possibilities to drop), in original order
        rules = (
            (next_identified == 'ps7', "remove ps5,14", ('ps5', 'ps14')),
            (next_identified == 'ps4', "remove ps5", ('ps5',)),
            (prev_identified == 'ps4', "remove ps3 and 7", ('ps3', 'ps7')),
        )
        for applies, message, unwanted in rules:
            if not applies:
                continue
            try:
                print(line_no,data)
            except Exception:
                pass
            print(message)
            for tag in unwanted:
                if tag in line_new_pos:
                    line_new_pos.remove(tag)

        df.loc[index, 'Identification_Status'] = ";".join(line_new_pos)

    return df
|
||
|
||
|
||
def prep_pnnbl_eligible_csv(pnbl_eligibility_matrix,nnbl_eligibility_matrix):
    """Normalise the two eligibility matrices and store them under ``mypath``.

    Each input CSV is read with its first row skipped; its first column is
    renamed to ``Description`` and its second to ``Possibilities``.  The
    results are written (without the index) as ``pnbl_eligible_pos.csv`` and
    ``nnbl_eligible_pos.csv`` in the ``mypath`` directory and then read back
    with ``Possibilities`` as index column.  The read-back frames are not
    used or returned; the function returns ``None``.

    :param pnbl_eligibility_matrix: path of the previous-line matrix CSV.
    :param nnbl_eligibility_matrix: path of the next-line matrix CSV.
    """
    out_dir = mypath
    jobs = (
        (pnbl_eligibility_matrix, 'pnbl_eligible_pos.csv'),
        (nnbl_eligibility_matrix, 'nnbl_eligible_pos.csv'),
    )

    # stage 1: load both matrices before anything is written
    frames = [pd.read_csv(source, skiprows = [0]) for source, _ in jobs]

    # stage 2: give the first two columns their fixed names
    for frame in frames:
        frame.rename(
            columns={frame.columns[1]: 'Possibilities', frame.columns[0]: 'Description'},
            inplace = True,
        )

    # stage 3: persist the normalised matrices
    for frame, (_, file_name) in zip(frames, jobs):
        frame.to_csv(os.path.join(out_dir, file_name), index =False)

    # stage 4: read the written files back, keyed by 'Possibilities'
    for _, file_name in jobs:
        pd.read_csv(os.path.join(out_dir, file_name), index_col = ['Possibilities'])
|
||
|
||
|
||
def check_eligibility_using_identified_pnnbl(df):
    """Filter each line's possibilities by what its identified neighbours allow.

    The eligibility matrices ``pnbl_eligible_pos.csv`` and
    ``nnbl_eligible_pos.csv`` are loaded from the ``mypath`` directory.  For
    every row that is not yet identified, the previous / next neighbour is
    looked up (one row away when ``plb`` / ``nlb`` is ``'N'``, two rows away
    otherwise, ``['blank']`` past either end of the frame).  When a neighbour
    has exactly one possibility, only the possibilities whose matrix entry in
    that neighbour's column equals ``'yes'`` are kept.  An empty result
    becomes the special term ``'ps17'``; a single remaining possibility marks
    the row as identified.

    Writes use ``df.loc[row, column]`` rather than chained assignment, and the
    misspelled ``pnpl_pos`` of the original now sets ``pnbl_pos``.

    :param df: line-wise script DataFrame; modified in place.
    :return: ``(df, pos_decreased, lines_identified)`` where ``pos_decreased``
        is True when the total number of possibilities went down in this pass
        and ``lines_identified`` counts rows marked identified in this pass.
    """
    total_pos_before = 0
    total_pos_after = 0
    lines_identified = 0
    cur_dir = mypath
    pnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'pnbl_eligible_pos.csv'))
    nnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'nnbl_eligible_pos.csv'))

    for index in df.index:
        if df.loc[index, 'isIdentified'] == 'Yes':
            # an identified line counts as one possibility before and after
            total_pos_before += 1
            total_pos_after += 1
            print(total_pos_before,total_pos_after)
            continue

        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data']
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        if cur_line_pos[0] != '':
            total_pos_before += len(cur_line_pos)

        if index == 0:
            pnbl_pos = ['blank']
        elif df.loc[index, 'plb'] == 'N':
            pnbl_pos = df.loc[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.loc[index - 2, 'Identification_Status'].split(";")

        if index == df.index[-1]:
            nnbl_pos = ['blank']
        elif df.loc[index, 'nlb'] == 'N':
            nnbl_pos = df.loc[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == df.index[-1]:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.loc[index + 2, 'Identification_Status'].split(";")

        line_new_pos = cur_line_pos
        try:
            print(line_no,data)
        except Exception:
            pass
        print("current line pos", cur_line_pos,df.loc[index, 'Identification_Status'])
        try:
            print("previous line pos",pnbl_pos)
            print("next line pos",nnbl_pos)
        except Exception:
            pass

        # (label used in messages, message when not usable, neighbour, matrix)
        neighbours = (
            ("pnbl", "previous line not identified", pnbl_pos, pnbl_eligible_df),
            ("nnbl", "next line not identified", nnbl_pos, nnbl_eligible_df),
        )
        for label, not_identified_msg, neighbour_pos, matrix in neighbours:
            if len(neighbour_pos) == 1 and neighbour_pos[0] != 'blank':
                print(label + " is identified as ", neighbour_pos)
                ## keep only possibilities which can exist with this neighbour
                eligible_pos = matrix.loc[matrix[neighbour_pos[0]] == 'yes','Possibilities'].to_list()
                print("eligible possibilties as per " + label,eligible_pos)
                line_new_pos = [ps for ps in line_new_pos if ps in eligible_pos]
                print("line new possibilities", line_new_pos)
            else:
                print(not_identified_msg)

        ## make null as special term
        if len(line_new_pos) == 0:
            print("making null possibility special term ps17")
            line_new_pos = ['ps17']

        if len(line_new_pos) == 1:
            df.loc[index, 'isIdentified'] = 'Yes'
            lines_identified += 1
        df.loc[index, 'Identification_Status'] = ";".join(line_new_pos)
        total_pos_after += len(line_new_pos)
        print(total_pos_before,total_pos_after)

    print(total_pos_before,total_pos_after)
    pos_decreased = total_pos_after < total_pos_before
    return df,pos_decreased,lines_identified
|
||
|
||
|
||
|
||
def do_while_examine_using_identified_pnnbl(df):
    """Repeat the neighbour-eligibility pass until it stops removing possibilities.

    ``check_eligibility_using_identified_pnnbl`` is run at least once and then
    again for as long as it reports that the number of possibilities
    decreased.  The pass number and the running count of identified lines are
    printed after every pass and once more at the end.

    :param df: line-wise script DataFrame handed to each pass.
    :return: the DataFrame returned by the last pass.
    """
    identified_so_far = 0
    passes = 0

    while True:
        passes += 1
        df, still_shrinking, newly_identified = check_eligibility_using_identified_pnnbl(df)
        identified_so_far += newly_identified
        print(passes,identified_so_far)
        if not still_shrinking:
            break

    print(passes,identified_so_far)
    return df
|
||
|
||
|
||
|
||
|
||
def start_top_identifications_part1(df):
    """Settle unidentified lines whose top-ranked possibility is trusted.

    First pass (all rows except the first and last): a line is reduced to its
    top possibility when that top is ``ps1``, ``ps7``, ``ps8`` or ``ps9``; or
    ``ps6`` with no parenthetical on the previous neighbour and a next top
    other than ``ps5`` / ``ps6``; or ``ps16`` with a next top other than
    ``ps13`` / ``ps15`` / ``ps10``.  Exceptions handled below: a ``ps6`` top
    with alternatives is left alone when it could still be ``ps15`` or
    ``ps7``; a ``ps9`` top with ``ps7`` second and no parenthetical becomes
    ``ps7``; ``ps2`` / ``ps3`` / ``ps30`` are kept next to the top in the
    situations coded below.  Otherwise, a line with ``ps5`` as first or
    second choice becomes ``ps6`` when the previous top is ``ps4``, the next
    line is blank and both lines share the same ``ssc`` indent.

    Second pass: a ``ps8`` top whose text contains a speaker extension is set
    to ``'ps8'``.

    Fixes relative to the original:

    * the misspelled ``pnpl_pos`` left ``pnbl_pos`` empty for the second row
      after a blank line, so later ``pnbl_pos[0]`` lookups raised IndexError;
    * ``pnbl_pos == 'ps14'`` compared a list with a string (always False);
    * ``cur_line_pos[1]`` raised IndexError for a single possibility;
    * writes use ``df.loc[row, column]`` instead of chained assignment;
    * previous-line attributes get defaults instead of being left undefined.

    NOTE(review): the source indentation was lost; all four ``ps6`` guards
    are assumed to sit under the ``len(cur_line_pos) > 1`` check.

    :param df: line-wise script DataFrame; modified in place.
    :return: the same ``df`` object.
    """
    stamp = 'StartIdentifyingTopsPart1'

    for index in df.index[1:-1]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        # ---- previous neighbour (one row back, or two across a blank) ----
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
        elif df.loc[index, 'plb'] == 'N':
            pnbl_pos = df.loc[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 2
            pnbl_pos = df.loc[pnbl_index, 'Identification_Status'].split(";")

        # ---- next neighbour, same convention ----
        nnbl_index = index + 1
        if index == df.index[-1]:
            nnbl_pos = ['blank']
        elif df.loc[index, 'nlb'] == 'N':
            nnbl_pos = df.loc[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == df.index[-1]:
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 2
            nnbl_pos = df.loc[nnbl_index, 'Identification_Status'].split(";")

        cur_indent = df.loc[index, 'ssc']
        pnbl_indent = df.loc[pnbl_index, 'ssc']
        nnbl_indent = df.loc[nnbl_index, 'ssc']

        # ---- top possibility of the line before the previous neighbour ----
        try:
            step = 1 if df.loc[pnbl_index, 'plb'] == 'N' else 2
            ppnbl_top = df.loc[pnbl_index - step, 'Identification_Status'].split(";")[0]
        except (KeyError, IndexError, AttributeError):
            ppnbl_top = None  # no such line near the top of the frame
        ppnbl_top_not_16 = ppnbl_top != 'ps16'

        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data']
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        cur_par = df.loc[index, 'parenthetical']
        next_line_adjacent = df.loc[index, 'nlb']

        pnbl_par = pnbl_case = pnbl_data = ''
        try:
            pnbl_par = df.loc[pnbl_index, 'parenthetical']
            pnbl_case = df.loc[pnbl_index, 'case']
            pnbl_data = df.loc[pnbl_index, 'data']
        except KeyError:
            pass

        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        top1 = cur_line_pos[0]
        # lower ranks fall back to the top when the line has fewer choices
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        top3 = cur_line_pos[2] if len(cur_line_pos) >= 3 else top1
        top_five = cur_line_pos[:5]
        next_top = nnbl_pos[0]

        ## if top is 1,6,7,8,9,16 identify them
        trusted_top = (
            top1 in ('ps1', 'ps7', 'ps8', 'ps9')
            or (top1 == 'ps6' and pnbl_par == 'Absent'
                and next_top not in ('ps5', 'ps6'))
            or (top1 == 'ps16' and next_top not in ('ps13', 'ps15', 'ps10'))
        )

        if trusted_top:
            try:
                print(line_no,data)
            except Exception:
                pass

            # a ps6 top with alternatives is only accepted when nothing
            # suggests it is really ps15 or ps7
            if len(cur_line_pos) > 1 and top1 == 'ps6':
                if 'ps15' in top_five:
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                if pnbl_pos[0] in ('ps13', 'ps14'):
                    print("not identifying as ps6 as could be ps15")
                    continue
                if pnbl_case == 'AllUpper' and len(pnbl_data.split()) == 1:
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                if len(data.split()) == 1 and cur_indent > pnbl_indent:
                    print("not identifying as ps6 can also be ps7 ")
                    continue

            print("identifying as top",top1)

            ## identify as 7 where 9 is top and 7 is 2nd and parenthetical absent
            if top1 == 'ps9' and top2 == 'ps7' and cur_par == 'Absent':
                line_new_pos = [top2]
            else:
                line_new_pos = [top1]

            if next_top == 'ps1' and 'ps2' in cur_line_pos:
                line_new_pos.append('ps2')
                print("added ps2 to ps1")

            if pnbl_pos[0] == 'ps1' and 'ps3' in cur_line_pos:
                line_new_pos.append('ps3')
                print("added ps3 to ps1")

            if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
                ## not indentifying as ps1
                continue

            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")

            df.loc[index, 'Identification_Status'] = ";".join(line_new_pos)
            df.loc[index, 'When_Identified'] = stamp
            continue

        if (top1 == 'ps5' or top2 == 'ps5') and 'ps16' not in pnbl_top2:
            if ((pnbl_pos[0] == 'ps4' or (pnbl_pos[0] == 'ps5' and ppnbl_top_not_16))
                    and next_line_adjacent == 'N'
                    and cur_indent == pnbl_indent and cur_indent == nnbl_indent):
                # identification as ps5 is disabled in the source
                print("code commented")
            elif pnbl_pos[0] == 'ps4' and next_line_adjacent == 'Y' and cur_indent == pnbl_indent:
                print("identifying current as ps6 as next also blank")
                try:
                    print(line_no,data)
                except Exception:
                    pass
                df.loc[index, 'Identification_Status'] = 'ps6'
                df.loc[index, 'When_Identified'] = stamp

    ## additonally identify the ps8
    extn_list = ['O.S.','V.O.',"CONT'D","CONT’D",'VOICE','CONT.']
    for index in df.index[1:-1]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue
        cur_line_pos = df.loc[index, 'Identification_Status'].split(";")
        data = df.loc[index, 'data']
        extn_found = any(extn in str(data) for extn in extn_list)

        if cur_line_pos[0] == 'ps8' and extn_found:
            df.loc[index, 'Identification_Status'] = 'ps8'
            df.loc[index, 'When_Identified'] = stamp

    return df
|
||
|
||
|
||
def start_top_identifications_part1_diluted(df):
    """Commit still-unidentified lines to one candidate using lenient rules.

    Every row except the first and the last whose ``isIdentified`` is not
    ``'Yes'`` is examined.  ``Identification_Status`` holds the candidate
    codes of a line as a ``;``-separated string, first-listed candidate first.
    The neighbours used are the previous / next non-blank lines ("pnbl" /
    "nnbl"), found through the ``plb`` / ``nlb`` flags (``'N'`` means the
    adjacent line is not blank; otherwise the line two rows away is used).

    Rules, in order:

    * A line whose first candidate is ``ps1`` but which also lists ``ps6``
      (second or third) or ``ps8`` (second) is left untouched.
    * A line whose first candidate is ``ps1`` or ``ps7``, or ``ps16`` when
      the next non-blank line does not start with ``ps15``, or ``ps6`` when
      the previous non-blank line has no parenthetical and the next one is
      neither exactly ``ps6`` nor starts with ``ps5``, is reduced to that
      first candidate (``ps1`` keeps ``ps30`` if present).  A ``ps6`` line
      that also lists ``ps15`` is left untouched.
    * A line listing ``ps5`` first or second becomes ``ps5`` when it sits at
      the same indent (``ssc``) as both neighbours, none of the three lines
      is ``AllUpper``, the next line is not blank and the previous non-blank
      line starts with ``ps4`` (or ``ps5`` not preceded by ``ps16``); it
      becomes ``ps6`` when the previous non-blank line starts with ``ps4`` at
      the same indent and the next line is blank.

    Assumes ``df`` has a 0-based consecutive integer index - TODO confirm
    with the caller.

    Args:
        df: line table with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``parenthetical``, ``ssc``, ``case``,
            ``line_no``, ``data`` and ``When_Identified``.

    Returns:
        The same DataFrame, updated in place.
    """

    def _log(*parts):
        # Script text may not be encodable on the console; a logging failure
        # must never abort the identification pass.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    def _cell(label, column, default):
        # Neighbour lookups fall back to a neutral value when the row is
        # missing instead of raising.
        try:
            return df.at[label, column]
        except (KeyError, ValueError):
            return default

    print("in tops diluted")
    if len(df.index) < 3:
        # First and last rows are never examined, so nothing is left to do.
        return df
    last_label = df.index[-1]

    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        print(index)

        # ---- previous non-blank line -------------------------------------
        pnbl_index = index - 1
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[pnbl_index, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            # Fixed: the original assigned to a misspelled ``pnpl_pos`` here,
            # leaving ``pnbl_pos`` empty and crashing on ``pnbl_pos[0]``.
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 2
            pnbl_pos = df.at[pnbl_index, 'Identification_Status'].split(";")

        # ---- next non-blank line -----------------------------------------
        nnbl_index = index + 1
        if index == last_label:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[nnbl_index, 'Identification_Status'].split(";")
        elif index + 1 == last_label:
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 2
            nnbl_pos = df.at[nnbl_index, 'Identification_Status'].split(";")

        cur_indent = df.at[index, 'ssc']
        pnbl_indent = _cell(pnbl_index, 'ssc', -1)
        pnbl_case = _cell(pnbl_index, 'case', '')
        nnbl_indent = _cell(nnbl_index, 'ssc', -1)
        nnbl_case = _cell(nnbl_index, 'case', '')

        # ---- line before the previous non-blank line ---------------------
        # Near the top of the table this row does not exist; None marks that.
        try:
            step_back = 1 if df.at[pnbl_index, 'plb'] == 'N' else 2
            ppnbl_pos = df.at[pnbl_index - step_back,
                              'Identification_Status'].split(";")
        except (KeyError, ValueError, AttributeError):
            ppnbl_pos = None

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        cur_line_case = df.at[index, 'case']
        pnbl_par = df.at[pnbl_index, 'parenthetical']
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        top1 = cur_line_pos[0]
        # Missing lower-ranked candidates default to the first one.
        top2 = cur_line_pos[1] if len(cur_line_pos) > 1 else top1
        top3 = cur_line_pos[2] if len(cur_line_pos) > 2 else top1

        if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
            # not identifying as ps1
            continue

        commit_to_top = (
            top1 in ('ps1', 'ps7')
            or (top1 == 'ps6'
                and pnbl_par == 'Absent'
                and "".join(nnbl_pos) != 'ps6'
                and nnbl_pos[0] != 'ps5')
            or (top1 == 'ps16' and nnbl_pos[0] != 'ps15')
        )
        if commit_to_top:
            _log(line_no, data)
            if top1 == 'ps6' and 'ps15' in cur_line_pos[:5]:
                print("not identifying as ps6 can also be ps15 ")
                continue
            print("identifying as top", top1)
            line_new_pos = [top1]
            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")
            # ``.at`` writes into ``df`` itself; the original chained
            # ``df[col][index] = ...`` form can write to a temporary copy.
            df.at[index, 'Identification_Status'] = ";".join(line_new_pos)
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'
            continue

        ppnbl_top_not_16 = ppnbl_pos is None or ppnbl_pos[0] != 'ps16'

        # ``top2`` is used instead of ``cur_line_pos[1]`` so that lines with a
        # single candidate no longer raise IndexError.
        if (top1 == 'ps5' or top2 == 'ps5') and 'ps16' not in pnbl_top2:
            follows_action = (
                pnbl_pos[0] == 'ps4'
                or (pnbl_pos[0] == 'ps5' and ppnbl_top_not_16)
            )
            same_indent_run = (
                cur_indent == pnbl_indent and cur_indent == nnbl_indent
            )
            no_all_caps = (
                pnbl_case != 'AllUpper'
                and cur_line_case != 'AllUpper'
                and nnbl_case != 'AllUpper'
            )
            if (follows_action and df.at[index, 'nlb'] == 'N'
                    and same_indent_run and no_all_caps):
                print("Lenient: code not commented")
                print("identifying current as ps5")
                _log(line_no, data)
                df.at[index, 'Identification_Status'] = 'ps5'
                df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'
            elif (pnbl_pos[0] == 'ps4' and df.at[index, 'nlb'] == 'Y'
                    and cur_indent == pnbl_indent):
                print("identifying current as ps6 as next also blank")
                _log(line_no, data)
                df.at[index, 'Identification_Status'] = 'ps6'
                df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'

    return df
|
||
|
||
def examine_speaker_mix_part1(df,audit_df):
    """Split "SPEAKER (parenthetical)" lines into two rows.

    A row is split when all of the following hold: it is not yet identified,
    its candidate list contains ``ps30``, its first candidate is not one of
    ``ps1/ps2/ps14/ps5/ps13/ps4``, its ``parenthetical`` flag is
    ``'PartMidEnd'``, its text holds no speaker extension (O.S., V.O.,
    CONT'D, VOICE), and the text before the first ``(`` is upper case.

    The original row keeps the text before ``(`` and gets ``parenthetical``
    ``'Absent'`` and ``nlb`` ``'N'``.  A new row is inserted right after it
    with the text from ``(`` onwards, ``Identification_Status`` ``'ps10'``,
    ``parenthetical`` ``'Complete'`` and a ``line_no`` halfway between the
    two neighbouring line numbers.

    Args:
        df: line table; it is sorted and re-indexed 0..n-1 before processing.
        audit_df: accepted for interface compatibility (see NOTE below).

    Returns:
        A new DataFrame with the split rows inserted and a fresh 0..n-1 index.
    """
    df = df.sort_index().reset_index(drop=True)
    # NOTE(review): the original code overwrites the ``audit_df`` argument
    # with a copy of ``df``, so every audit update below lands in a scratch
    # frame that is discarded on return.  This is kept as-is because the
    # collision check on ``audit_df.index`` feeds into the ``line_no`` written
    # to ``df``; confirm whether the caller's audit frame was meant here.
    audit_df = df.sort_index().reset_index(drop=True)

    not_speaker_tops = ('ps1', 'ps2', 'ps14', 'ps5', 'ps13', 'ps4')
    extn_list = ['O.S.','V.O.',"CONT'D","CONT’D",'VOICE']

    def _log(*parts):
        # Console encodings may reject script text; never abort on logging.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    # A positional ``while`` loop is used because rows are inserted while
    # scanning: iterating the index captured up front (as the original did)
    # stops early and never reaches the last rows after an insertion.
    index = -1
    while index + 1 < len(df):
        index += 1

        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        if 'ps30' not in cur_line_pos:
            continue
        if cur_line_pos[0] in not_speaker_tops:
            # skipping as could be slugline
            continue

        data = df.at[index, 'data']
        if df.at[index, 'parenthetical'] != 'PartMidEnd':
            continue
        if any(extn in str(data) for extn in extn_list):
            continue
        if not isinstance(data, str):
            continue

        _log(data)
        paren_at = data.find('(')
        if paren_at == -1:
            continue
        before_par = data[:paren_at]
        after_par = data[paren_at:]
        _log("before_par:", before_par)
        _log("after_par:", after_par)
        print("Seperating Parenthetical")
        print("Identifying as speaker mix with dialogue and current pos")
        print(cur_line_pos)
        print(index)

        if not before_par.isupper():
            # skip as possibly not speaker
            continue

        # ``.at`` writes into ``df`` itself (no chained assignment).
        df.at[index, 'data'] = before_par
        df.at[index, 'parenthetical'] = 'Absent'
        df.at[index, 'When_Identified'] = 'ExaminingSpeakerMix'
        nlb = df.at[index, 'nlb']
        df.at[index, 'nlb'] = 'N'

        line_no = df.at[index, 'line_no']
        line_no_value = float(line_no)
        if index + 1 < len(df):
            next_line_no = float(df.at[index + 1, 'line_no'])
        else:
            # Last row: there is no following line number to average with.
            next_line_no = line_no_value + 1
        new_line_no = (line_no_value + next_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + next_line_no) / 2

        # Scratch audit bookkeeping (see NOTE above).  ``.loc[row, col]``
        # sets the cell directly; the original ``.loc[row][col] = ...`` form
        # assigned into a temporary row copy.
        audit_df.loc[new_line_no] = np.nan
        audit_df.loc[new_line_no, 'line_removed'] = 'No'
        audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = 'Separated Speaker Mixed with Parenthetical'

        print(df.at[index, 'Identification_Status'])
        _log(after_par)
        print("identifying parenthetical")

        # A fractional label places the new row right after the current one
        # once the frame is sorted by index again.
        new_label = index + 0.25
        df.loc[new_label] = np.nan
        df.loc[new_label, 'data'] = after_par
        df.loc[new_label, 'parenthetical'] = 'Complete'
        df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerMix'
        df.loc[new_label, 'Identification_Status'] = 'ps10'
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = nlb
        df.loc[new_label, 'line_no'] = new_line_no

        df = df.sort_index().reset_index(drop=True)

    return df
|
||
# df.to_csv(p.output_file_path,index=False)
|
||
|
||
# lines_not_removed = audit_df.loc[audit_df['line_removed'] != 'Yes'].index.to_list()
|
||
# audit_df.sort_index(inplace= True)
|
||
# audit_df.reset_index(inplace= True)
|
||
|
||
# for line in lines_not_removed:
|
||
# new_data = ''
|
||
# try:
|
||
# new_data =df.loc[df['line_no'] == line, 'data'].values[0]
|
||
# except:
|
||
# pass
|
||
# #print(new_data)
|
||
# audit_df.loc[audit_df['line_no'] == line, 'data_corrected'] = new_data
|
||
# #print(audit_df.loc[audit_df['line_no'] == line, 'data_corrected'])
|
||
|
||
|
||
|
||
# audit_df.to_csv(p.audit_report_path, index = False)
|
||
|
||
def _note_split_in_audit(audit_df, line_no, new_line_no, note):
    """Record in ``audit_df`` that ``line_no`` was split and ``new_line_no`` added."""
    try:
        # ``.loc[row, col] = value`` creates the row when it is missing; the
        # original ``.loc[row][col] = value`` wrote into a temporary copy.
        audit_df.loc[new_line_no, 'line_removed'] = 'No'
        audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = note
    except (KeyError, ValueError, TypeError) as err:
        # The audit report is secondary; do not lose the split because of it.
        print("could not update audit report for line", line_no, ":", err)


def examine_speaker_mix_part2(df,audit_df):
    """Split ``ps30`` lines that mix a speaker name with dialogue.

    Only rows whose first candidate (and second, when present) contains
    ``ps30`` are examined.  Two shapes are handled:

    * ``SPEAKER: dialogue`` - when the text contains a colon and no speaker
      extension (O.S., V.O., CONT'D, VOICE), the text before the first colon
      stays on the row and the text after it moves to a new ``ps15`` row.
    * ``SPEAKER dialogue`` - otherwise, when the first candidate is exactly
      ``ps30``, the leading run of upper-case words stays on the row and the
      remaining words move to a new ``ps15;ps13`` row.

    In both cases the original row becomes ``ps7`` with ``nlb`` ``'N'``, and
    the new row gets a ``line_no`` halfway between the neighbouring line
    numbers.  The split is also noted in ``audit_df``.

    Args:
        df: line table; it is sorted and re-indexed 0..n-1 before processing.
        audit_df: audit report, apparently indexed by line number (TODO
            confirm); updated in place.

    Returns:
        A new DataFrame with the dialogue rows inserted and a 0..n-1 index.
    """
    ## examine the ps30s and split with colon and all caps speaker
    print("Start speaker mix part2")

    def _log(*parts):
        # Console encodings may reject script text; never abort on logging.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    extn_list = ['O.S.','V.O.',"CONT'D","CONT’D",'VOICE']

    # Rows are addressed by position and inserted with fractional labels, so
    # normalise the index first (as part 1 does).
    df = df.sort_index().reset_index(drop=True)

    # ``while`` over the live length: rows are inserted during the scan, and
    # iterating a captured index would stop before the last rows.
    index = -1
    while index + 1 < len(df):
        index += 1

        status = df.at[index, 'Identification_Status']
        if not isinstance(status, str):
            # NaN status: nothing to examine.
            continue
        cur_line_pos = status.split(";")
        if 'ps30' not in cur_line_pos[0]:
            continue
        if len(cur_line_pos) > 1 and 'ps30' not in cur_line_pos[1]:
            continue

        data = df.at[index, 'data']
        if not isinstance(data, str):
            continue
        _log("data:\n", data)

        extn_found = any(extn in data for extn in extn_list)
        colon_at = data.find(':')

        if colon_at != -1 and not extn_found:
            speaker_text = data[:colon_at]
            dialogue_text = data[colon_at + 1:]
            # Fixed: the original tested ``before_colon.strip().isspace()``,
            # which is never true, so a blank "speaker" slipped through.
            if not speaker_text.strip():
                print("nothing before colon")
                continue
            print("Seperating speaker dialogue separated by colon")
            dialogue_status = 'ps15'
            audit_note = 'Separated Speaker and Dialogue mixed with colon:'
            mark_all_upper = False
        elif cur_line_pos[0] == 'ps30':
            words = data.lstrip().split(" ")
            upper_count = 0
            for word in words:
                if not word.isupper():
                    break
                upper_count += 1
            print(upper_count)
            speaker_text = " ".join(words[:upper_count]).strip()
            # Fixed: the original concatenated the dialogue words with ''
            # and so glued them together without spaces.
            dialogue_text = " ".join(words[upper_count:]).strip()
            print("Seperating speaker dialogue for ps30")
            if not speaker_text or not dialogue_text:
                print("unable to separate speaker from line, speaker possibly blank or line is not speaker dialogue mix",index)
                continue
            dialogue_status = 'ps15;ps13'
            audit_note = 'Separated Speaker and Dialogue '
            mark_all_upper = True
        else:
            continue

        print(index)
        _log(speaker_text)

        # ---- turn the current row into the speaker row -------------------
        df.at[index, 'data'] = speaker_text
        df.at[index, 'parenthetical'] = 'Absent'
        df.at[index, 'When_Identified'] = 'ExaminingSpeakerMixDialogue'
        if mark_all_upper:
            df.at[index, 'case'] = 'AllUpper'
        df.at[index, 'Identification_Status'] = 'ps7'
        nlb = df.at[index, 'nlb']
        df.at[index, 'nlb'] = 'N'

        # ---- line number for the new dialogue row ------------------------
        line_no = df.at[index, 'line_no']
        line_no_value = float(line_no)
        if index + 1 < len(df):
            next_line_no = float(df.at[index + 1, 'line_no'])
        else:
            # Last row: there is no following line number to average with.
            next_line_no = line_no_value + 1
        new_line_no = (line_no_value + next_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + next_line_no) / 2
        print("new_line_no:", new_line_no)

        _note_split_in_audit(audit_df, line_no, new_line_no, audit_note)

        # ---- insert the dialogue row right after the speaker row ---------
        print("identifying dialogue as", dialogue_status)
        _log(dialogue_text)
        new_label = index + 0.25
        df.loc[new_label] = ''
        df.loc[new_label, 'data'] = dialogue_text
        df.loc[new_label, 'parenthetical'] = 'Absent'
        df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerMixDialogue'
        df.loc[new_label, 'Identification_Status'] = dialogue_status
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = nlb
        df.loc[new_label, 'line_no'] = new_line_no

        df = df.sort_index().reset_index(drop=True)

    return df
|
||
# df.to_csv(p.output_file_path, index = False)
|
||
|
||
|
||
# lines_not_removed = audit_df.loc[audit_df['line_removed'] != 'Yes'].index.to_list()
|
||
# audit_df.sort_index(inplace= True)
|
||
# audit_df.reset_index(inplace= True)
|
||
|
||
# for line in lines_not_removed:
|
||
# new_data = ''
|
||
# try:
|
||
# new_data =df.loc[df['line_no'] == line, 'data'].values[0]
|
||
# except:
|
||
# pass
|
||
# #print(new_data)
|
||
# audit_df.loc[audit_df['line_no'] == line, 'data_corrected'] = new_data
|
||
# #print(audit_df.loc[audit_df['line_no'] == line, 'data_corrected'])
|
||
|
||
|
||
|
||
# audit_df.to_csv(p.audit_report_path, index = False)
|
||
|
||
|
||
def _nearest_other_indent_pos(df, start, stop, step, indent):
    """Return the candidates of the nearest non-blank line with another indent.

    Rows are visited from ``start`` in direction ``step``; the row labelled
    ``stop`` itself is never examined.  Returns ``None`` when no such line
    is found before ``stop``.
    """
    label = start
    while label != stop:
        status = df.at[label, 'Identification_Status']
        if (isinstance(status, str) and status != 'blank'
                and df.at[label, 'ssc'] != indent):
            return status.split(";")
        label += step
    return None


def start_top_identifications_part2(df):
    """Narrow down ``ps5`` / ``ps6`` / ``ps15`` candidates from line context.

    For every row that is not yet identified and has a status, in order:

    1. If the candidates include both ``ps6`` and ``ps15``, the nearest
       non-blank lines with a *different* indent (``ssc``) before and after
       are looked up.  Between a ``ps15``/``ps16`` line and a ``ps7``/``ps10``
       line ``ps15`` is dropped; between a ``ps7``/``ps10`` line and a
       ``ps1``/``ps4``/``ps6``/``ps16`` line ``ps6`` is dropped.
    2. If the directly following line is exactly ``ps6``, ``ps6`` is dropped
       from the current candidates.
    3. If the next non-blank line starts with ``ps1`` or ``ps7`` (or ``ps4``
       while the current line does not rank ``ps1`` first or second), the
       previous non-blank line lists ``ps5`` first or second, and both share
       the same indent, the line becomes ``ps6``.
    4. If the two leading candidates are ``ps5``/``ps6``, the next line is
       blank, the next non-blank line is exactly ``ps6`` and the previous
       line is not blank: same indent as the previous line gives ``ps6``,
       otherwise ``ps5`` is dropped.

    Assumes ``df`` has a 0-based consecutive integer index - TODO confirm
    with the caller.

    Args:
        df: line table with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``ssc``, ``lcp``, ``line_no``,
            ``data`` and ``When_Identified``.

    Returns:
        The same DataFrame, updated in place.
    """

    def _log(*parts):
        # Console encodings may reject script text; never abort on logging.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        # ---- previous non-blank line (None = there is none) --------------
        pnbl_index = None
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_index = index - 1
            pnbl_pos = df.at[pnbl_index, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            # Fixed: the original assigned to a misspelled ``pnpl_pos`` here.
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 2
            pnbl_pos = df.at[pnbl_index, 'Identification_Status'].split(";")

        # ---- next non-blank line -----------------------------------------
        if index == last_label:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == last_label:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, 'Identification_Status'].split(";")

        cur_indent = df.at[index, 'ssc']
        pnbl_indent = df.at[pnbl_index, 'ssc'] if pnbl_index is not None else -1
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        cur_line_pos = status.split(";")
        # Ranking as it was on entry; step 3 deliberately uses these values
        # even if steps 1-2 rewrite the status in between.
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) > 1 else top1

        # ---- 1. line can be both ps6 and ps15 ----------------------------
        if 'ps6' in cur_line_pos and 'ps15' in cur_line_pos:
            print("CURRENT CONATINS 15 6")
            _log(data)
            # The scans are linear, so they are done only where needed.
            pdil_pos = _nearest_other_indent_pos(df, index, 0, -1, cur_indent)
            ndil_pos = _nearest_other_indent_pos(
                df, index, last_label, 1, cur_indent)
            print("check pdil , ndil possibilties")
            print(pdil_pos)
            print(cur_line_pos)
            print(ndil_pos)
            if pdil_pos is not None and ndil_pos is not None:
                remaining = list(cur_line_pos)
                if pdil_pos[0] in ('ps15', 'ps16'):
                    if ndil_pos[0] in ('ps7', 'ps10'):
                        print("remove ps15")
                        remaining.remove('ps15')
                        print(remaining)
                        df.at[index, 'Identification_Status'] = ";".join(remaining)
                elif pdil_pos[0] in ('ps7', 'ps10'):
                    if ndil_pos[0] in ('ps1', 'ps4', 'ps6', 'ps16'):
                        print("remove ps6")
                        remaining.remove('ps6')
                        df.at[index, 'Identification_Status'] = ";".join(remaining)

        print("\n")

        # ---- diagnostic only: possible slug line -------------------------
        if 'ps1' in cur_line_pos[:5] and df.at[index, 'lcp'] < 60:
            print(pnbl_pos)
            print(nnbl_pos)
            _log("pssible slug", data)
            # Fixed: the original compared the candidate *lists* with
            # strings, which is always False.
            if (pnbl_pos[0] in ('ps6', 'ps15', 'ps16', 'ps17')
                    and nnbl_pos[0] == 'ps4'):
                print("line is ps1")

        # ---- 2. directly followed by a ps6 line --------------------------
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        if "".join(nnbl_pos) == 'ps6' and df.at[index, 'nlb'] == 'N':
            remaining = [ps for ps in cur_line_pos if ps != 'ps6']
            print(remaining)
            df.at[index, 'Identification_Status'] = ";".join(remaining)
        print("\n")

        # ---- 3. continuation of a ps5 run before ps4 / ps1 / ps7 ---------
        if ((nnbl_pos[0] == 'ps4' and top1 != 'ps1' and top2 != 'ps1')
                or nnbl_pos[0] == 'ps1' or nnbl_pos[0] == 'ps7'):
            print(pnbl_pos)
            if (pnbl_index is not None
                    and (pnbl_pos[0] == 'ps5' or pnbl_top2 == 'ps5')
                    and cur_indent == pnbl_indent):
                _log(line_no, data, "identifying as PS6")
                df.at[index, 'Identification_Status'] = 'ps6'
                df.at[index, 'When_Identified'] = 'StartTopIdentificationPart2'
                continue

        # ---- 4. choose between ps5 and ps6 -------------------------------
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        if len(cur_line_pos) == 1:
            continue
        if (cur_line_pos[0] in ('ps5', 'ps6')
                and cur_line_pos[1] in ('ps5', 'ps6')
                and df.at[index, 'nlb'] == 'Y'
                and "".join(nnbl_pos) == 'ps6'
                and df.at[index, 'plb'] == 'N'):
            if cur_indent == pnbl_indent:
                _log(line_no, data, "identifying as ps6")
                df.at[index, 'Identification_Status'] = 'ps6'
            else:
                # NOTE(review): this ``else`` is attached to the indent
                # check; verify against the canonical file, since the
                # nesting of the reviewed copy was not recoverable.
                # remove ps5
                remaining = [ps for ps in cur_line_pos if ps != 'ps5']
                _log(line_no, data, "removed ps5")
                df.at[index, 'Identification_Status'] = ";".join(remaining)

    return df
|
||
|
||
|
||
def start_slug_identification(df):
    """Settle ``ps1`` versus ``ps18`` for lines that follow a ``ps16`` line.

    Every row except the first and last that is not yet identified is
    examined.  When the previous non-blank line is exactly ``ps16``, the next
    non-blank line does not list ``ps1``, and the current line lists both
    ``ps1`` and ``ps18``, the values of the ``ps1`` and ``ps18`` columns
    (apparently per-candidate weights - TODO confirm) are compared; if
    ``ps1`` is strictly larger the line becomes ``ps1``.

    Assumes ``df`` has a 0-based consecutive integer index - TODO confirm
    with the caller.

    Args:
        df: line table with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``ps1``, ``ps18``, ``line_no``,
            ``data`` and ``When_Identified``.

    Returns:
        The same DataFrame, updated in place.
    """
    if len(df.index) < 3:
        # First and last rows are never examined, so nothing is left to do.
        return df
    last_label = df.index[-1]

    # loop through to examine slug
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        status = df.at[index, 'Identification_Status']
        if not isinstance(status, str):
            # NaN status: nothing to split and examine.
            continue

        # ---- previous non-blank line -------------------------------------
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            # Fixed: the original assigned to a misspelled ``pnpl_pos`` here.
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.at[index - 2, 'Identification_Status'].split(";")

        # ---- next non-blank line -----------------------------------------
        if index == last_label:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == last_label:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, 'Identification_Status'].split(";")

        if "".join(pnbl_pos) != 'ps16' or 'ps1' in nnbl_pos:
            continue
        print(nnbl_pos)

        cur_line_pos = status.split(";")
        if 'ps1' not in cur_line_pos or 'ps18' not in cur_line_pos:
            continue

        wt1 = int(df.at[index, 'ps1'])
        wt18 = int(df.at[index, 'ps18'])
        if wt1 > wt18:
            print("identifying current as ps1 ")
            try:
                print(df.at[index, 'line_no'], df.at[index, 'data'])
            except (UnicodeError, OSError):
                # Console encodings may reject script text.
                pass
            # ``.at`` writes into ``df`` itself (no chained assignment).
            df.at[index, 'Identification_Status'] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingSlug'

    return df
|
||
|
||
|
||
def start_top_identifications_part3(df):
    """Confirm ``ps1`` for lines sitting between a ``ps16`` and a ``ps4`` line.

    For every row that is not yet identified and has a status: when its first
    candidate is ``ps1``, the previous non-blank line lists ``ps16`` among
    its first five candidates and the next non-blank line lists ``ps4`` among
    its first three, the line is set to ``ps1``.  (The log message describes
    this as "between top transition and action".)

    Assumes ``df`` has a 0-based consecutive integer index - TODO confirm
    with the caller.

    Args:
        df: line table with the columns ``isIdentified``, ``plb``, ``nlb``,
            ``Identification_Status``, ``line_no``, ``data`` and
            ``When_Identified``.

    Returns:
        The same DataFrame, updated in place.
    """

    def _log(*parts):
        # Console encodings may reject script text; never abort on logging.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    for index in df.index:
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        # ---- previous non-blank line -------------------------------------
        if index == 0:
            pnbl_pos = ['blank']
        elif df.at[index, 'plb'] == 'N':
            pnbl_pos = df.at[index - 1, 'Identification_Status'].split(";")
        elif index - 1 == 0:
            # Fixed: the original assigned to a misspelled ``pnpl_pos`` here
            # and patched the resulting empty list up further down.
            pnbl_pos = ['blank']
        else:
            pnbl_pos = df.at[index - 2, 'Identification_Status'].split(";")

        # ---- next non-blank line -----------------------------------------
        if index == last_label:
            nnbl_pos = ['blank']
        elif df.at[index, 'nlb'] == 'N':
            nnbl_pos = df.at[index + 1, 'Identification_Status'].split(";")
        elif index + 1 == last_label:
            nnbl_pos = ['blank']
        else:
            nnbl_pos = df.at[index + 2, 'Identification_Status'].split(";")

        print("\n")

        cur_line_pos = status.split(";")
        if cur_line_pos[0] != 'ps1':
            continue

        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        _log("possible slug", data)
        print(pnbl_pos)
        print(nnbl_pos)

        # Only the leading candidates of each neighbour are considered.
        if 'ps16' in pnbl_pos[:5] and 'ps4' in nnbl_pos[:3]:
            print("identifying current as ps1 as between top transitiona and action")
            _log(line_no, data)
            # ``.at`` writes into ``df`` itself (no chained assignment).
            df.at[index, 'Identification_Status'] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart3'

    return df
|
||
|
||
|
||
def start_top_identifications_part4(df):
    """Pass 4 of the neighbour-based narrowing of ``Identification_Status``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first.  For every row that is not yet identified the nearest
    neighbours are looked up (the adjacent row when ``plb`` / ``nlb`` is
    ``'N'``, otherwise the row two positions away; ``['blank']`` at the
    edges of the frame) and two rules are tried:

    * top candidate ``ps4``, previous neighbour exactly ``ps15``, next
      neighbour exactly ``ps6``, ``case`` not ``'AllUpper'``, ``nlb == 'N'``
      and the same ``ssc`` (indent) as the next neighbour -> ``ps4``;
    * ``ps16`` among the first three candidates, previous neighbour exactly
      ``ps15`` or ``ps6``, next neighbour exactly ``ps1``, ``plb`` and
      ``nlb`` both ``'Y'`` and top candidate not ``ps6`` -> ``ps16``.

    Afterwards the last row (or the one before it when the last row is
    ``blank``) is collapsed to its top candidate if that is ``ps6`` or
    ``ps15``, and ``isIdentified`` is refreshed for every row.

    Fixes relative to the previous implementation:

    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * an empty frame, a single ``blank`` row or a missing (NaN) status no
      longer crash the last-line step or the closing loop;
    * unused locals and leftover debug dumps were removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0 (neighbours are addressed as ``index - 1`` etc.)
            with columns ``isIdentified``, ``Identification_Status``,
            ``When_Identified``, ``plb``, ``nlb``, ``ssc``, ``case``,
            ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'
    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    def _trace(*parts):
        # Best-effort console trace; the original guarded these prints with
        # try/except (presumably against console encoding errors).
        try:
            print(*parts)
        except Exception:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        # Previous neighbour: adjacent row, or the one before it.
        if index == 0 or (plb != 'N' and index - 1 == 0):
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 1 if plb == 'N' else index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next neighbour: adjacent row, or the one after it.
        if index == last_label or (nlb != 'N' and index + 1 == last_label):
            nnbl_pos = ['blank']
            nnbl_indent = -1
        else:
            nnbl_index = index + 1 if nlb == 'N' else index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")
            nnbl_indent = df.at[nnbl_index, 'ssc']

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        prev_status = ";".join(pnbl_pos)
        next_status = ";".join(nnbl_pos)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        ## between 15 and 6 , top 4, nlb=N
        if (top1 == 'ps4' and prev_status == 'ps15' and next_status == 'ps6'
                and df.at[index, 'case'] != 'AllUpper'
                and df.at[index, 'ssc'] == nnbl_indent and nlb == 'N'):
            _trace("identifying current as ps4 as between dialogue and action end and top action begin")
            _trace(line_no, data)
            df.at[index, status_col] = 'ps4'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart4'
            continue

        ## between 15,6 and 1 , top 3 has 16, nlb=Y , plb =Y
        if ('ps16' in cur_line_pos[:3] and prev_status in ('ps15', 'ps6')
                and next_status == 'ps1'
                and plb == 'Y' and nlb == 'Y' and top1 != 'ps6'):
            _trace("identifying current as transition ")
            _trace(line_no, data)
            df.at[index, status_col] = 'ps16'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart4'
            continue

    # Collapse the last non-blank row to its top candidate when that
    # candidate is ps6 or ps15.
    last_line_index = last_label
    if df.at[last_line_index, status_col] == 'blank':
        last_line_index -= 1
    if last_line_index in df.index:
        last_status = df.at[last_line_index, status_col]
        if not pd.isna(last_status):
            last_pos = last_status.split(";")
            if len(last_pos) > 1 and last_pos[0] in ('ps6', 'ps15'):
                _trace("Identifying last line as top", last_pos[0])
                _trace(df.at[last_line_index, 'line_no'], df.at[last_line_index, 'data'])
                df.at[last_line_index, status_col] = last_pos[0]
                df.at[last_line_index, 'When_Identified'] = 'IdentifyingLastLine'

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
def start_top_identifications_part5(df):
    """Pass 5 of the neighbour-based narrowing of ``Identification_Status``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first.  For every row that is not yet identified the nearest
    neighbours are looked up (the adjacent row when ``plb`` / ``nlb`` is
    ``'N'``, otherwise the row two positions away; ``['blank']`` at the
    edges) and the following steps run in order:

    1. previous neighbour exactly ``ps5`` or ``ps4``, top candidate ``ps5``
       and same ``ssc`` (indent) as the next neighbour: the row becomes
       ``ps5`` when the next neighbour has ``ps6`` in its first two
       candidates (and does not start with ``ps1`` / ``ps2``) or starts
       with ``ps5``;
    2. previous neighbour exactly ``ps16``, ``ps1`` in the row's first two
       candidates and ``ps4`` or ``ps6`` in the next neighbour's first two
       candidates: the row becomes ``ps1``;
    3. otherwise, if the next neighbour has no ``ps4`` in its first four and
       no ``ps6`` in its first two candidates, ``ps1`` and ``ps3`` are
       removed from the row -- unless the next neighbour starts with
       ``ps7`` / ``ps8`` and the row itself starts with ``ps1`` / ``ps3``;
    4. if ``lcp`` is below 68, ``ps2`` and ``ps18`` are removed.

    Finally ``isIdentified`` is refreshed for every row.

    Fixes relative to the previous implementation:

    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``
      and skips rows whose status is missing instead of crashing;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * the two byte-identical ``ps5`` branches were merged, unused locals
      and leftover debug dumps removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0, with columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``,
            ``nlb``, ``ssc``, ``lcp``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting (in particular which ``if`` the ``elif``
    # of step 1 belongs to) was reconstructed from a listing that had lost
    # its indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'
    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    def _trace(*parts):
        # Best-effort console trace; the original guarded these prints with
        # try/except (presumably against console encoding errors).
        try:
            print(*parts)
        except Exception:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        # Previous neighbour: adjacent row, or the one before it.
        if index == 0 or (plb != 'N' and index - 1 == 0):
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 1 if plb == 'N' else index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next neighbour: adjacent row, or the one after it.
        if index == last_label or (nlb != 'N' and index + 1 == last_label):
            nnbl_pos = ['blank']
            nnbl_indent = -1
        else:
            nnbl_index = index + 1 if nlb == 'N' else index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")
            nnbl_indent = df.at[nnbl_index, 'ssc']

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        nnbl_top1 = nnbl_pos[0]
        prev_status = "".join(pnbl_pos)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        ## pnbl is ps5 or 4 , cur top 5 , next top 2 has 6, same indent as next ; then current is ps5
        if (prev_status in ('ps5', 'ps4') and top1 == 'ps5'
                and df.at[index, 'ssc'] == nnbl_indent):
            next_allows_ps6 = ('ps6' in nnbl_pos[:2]
                               and nnbl_top1 not in ('ps1', 'ps2'))
            if next_allows_ps6 or nnbl_top1 == 'ps5':
                _trace("identifying current as ps5 as between actions")
                _trace(line_no, data)
                df.at[index, status_col] = 'ps5'
                df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart5'
                continue

        ## pnbl is ps16 , cur top2 has ps1 , next top 2 has 4 or 6 , declare ps1
        if (prev_status == 'ps16' and 'ps1' in cur_line_pos[:2]
                and ('ps4' in nnbl_pos[:2] or 'ps6' in nnbl_pos[:2])):
            _trace("identifying current as ps1 as between transition and action")
            _trace(line_no, data)
            df.at[index, status_col] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart5'
            continue

        ## nnbl top4 does not have ps4 (nor ps6 in top2): remove 1,3 from current line
        ## dont remove if next line is identified as speaker and ps1 is top
        if 'ps4' not in nnbl_pos[:4] and 'ps6' not in nnbl_pos[:2]:
            if nnbl_top1 in ('ps7', 'ps8') and top1 in ('ps1', 'ps3'):
                _trace("not removing ps1 as next is speaker and current top is slugline")
            else:
                kept = [ps for ps in cur_line_pos if ps not in ('ps1', 'ps3')]
                _trace("Removing ps1 ps3 from current as next does not have ps4 in top4 ps6 in top2")
                _trace(line_no, data)
                df.at[index, status_col] = ";".join(kept)

        # Re-read: the step above may just have rewritten the candidates.
        cur_line_pos = df.at[index, status_col].split(";")

        ## remove ps2 and ps18 as a possibility if the last character sits too far left
        # NOTE(review): the threshold is 68 while the message (and the
        # original comment) say 75 -- confirm which one is intended.
        if df.at[index, 'lcp'] < 68 and ('ps2' in cur_line_pos or 'ps18' in cur_line_pos):
            _trace("Removing ps2 ps18 from current as lcp < 75")
            _trace(line_no, data)
            kept = [ps for ps in cur_line_pos if ps not in ('ps2', 'ps18')]
            df.at[index, status_col] = ";".join(kept)

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
def start_top_identifications_part6(df):
    """Pass 6 of the neighbour-based narrowing of ``Identification_Status``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first.  For every row that is not yet identified the nearest
    neighbours are looked up (the adjacent row when ``plb`` / ``nlb`` is
    ``'N'``, otherwise the row two positions away; ``['blank']`` at the
    edges).  A row whose top candidate is ``ps1``, whose previous neighbour
    is exactly ``ps17`` and whose next neighbour has ``ps4`` among its
    first two candidates is fixed to ``ps1``.

    Finally ``isIdentified`` is refreshed for every row.

    Fixes relative to the previous implementation:

    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``;
    * rows with a missing status are skipped in the closing loop; before,
      the failure was caught, ``"JJJJ"`` printed, and the candidates of the
      *previous* row were silently reused;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * unused locals and leftover debug dumps were removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0, with columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``,
            ``nlb``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'
    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    def _trace(*parts):
        # Best-effort console trace; the original guarded these prints with
        # try/except (presumably against console encoding errors).
        try:
            print(*parts)
        except Exception:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        # Previous neighbour: adjacent row, or the one before it.
        if index == 0 or (plb != 'N' and index - 1 == 0):
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 1 if plb == 'N' else index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next neighbour: adjacent row, or the one after it.
        if index == last_label or (nlb != 'N' and index + 1 == last_label):
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 1 if nlb == 'N' else index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")

        top1 = status.split(";")[0]

        ## top 1 is ps1 pnbl is 17 nnbl has ps4 in top2
        if top1 == 'ps1' and "".join(pnbl_pos) == 'ps17' and 'ps4' in nnbl_pos[:2]:
            _trace("identifying current as ps1 as between special term and action")
            _trace(df.at[index, 'line_no'], df.at[index, 'data'])
            df.at[index, status_col] = 'ps1'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart6'
            continue

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
def start_top_identifications_part7(df):
    """Pass 7 of the neighbour-based narrowing of ``Identification_Status``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first.  The pass only looks at rows that are not yet
    identified and whose two best candidates are ``ps15`` and ``ps6`` (in
    either order).  Such a row is fixed to ``ps15`` when

    * its ``ssc`` (indent) is smaller than that of its previous neighbour
      (the adjacent row when ``plb`` is ``'N'``, otherwise the row two
      positions up), and
    * the row before that neighbour (located the same way, using the
      neighbour's own ``plb``) has a smaller indent than the neighbour.

    Finally ``isIdentified`` is refreshed for every row.

    Fixes relative to the previous implementation:

    * the second indent check compared ``str(a) < str(b)``, i.e.
      lexicographically (``"5" < "15"`` is False); indents are now compared
      as numbers, falling back to the textual comparison only when a value
      cannot be converted;
    * looking two rows above the neighbour near the top of the frame raised
      ``KeyError``; the rule is now simply skipped there;
    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``
      and skips rows whose status is missing;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * unused locals and leftover debug dumps were removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0, with columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``,
            ``ssc``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'

    def _trace(*parts):
        # Best-effort console trace; the original guarded these prints with
        # try/except (presumably against console encoding errors).
        try:
            print(*parts)
        except Exception:
            pass

    def _indent_less(left, right):
        # Numeric comparison of two indent values; textual fallback keeps
        # the old behaviour for values that are not numbers.
        try:
            return float(left) < float(right)
        except (TypeError, ValueError):
            return str(left) < str(right)

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        ## top 1 and 2 are (ps6 and ps15)
        if tuple(cur_line_pos[:2]) not in (('ps15', 'ps6'), ('ps6', 'ps15')):
            continue

        # Previous neighbour: adjacent row, or the one before it.  Without
        # a previous neighbour the rule cannot apply.
        plb = df.at[index, 'plb']
        if index == 0 or (plb != 'N' and index - 1 == 0):
            continue
        pnbl_index = index - 1 if plb == 'N' else index - 2
        pnbl_indent = df.at[pnbl_index, 'ssc']

        if not df.at[index, 'ssc'] < pnbl_indent:
            continue

        # Row preceding the neighbour, found with the neighbour's own plb.
        if df.at[pnbl_index, 'plb'] == 'N':
            ppnbl_index = pnbl_index - 1
        else:
            ppnbl_index = pnbl_index - 2
        if ppnbl_index not in df.index:
            continue

        if _indent_less(df.at[ppnbl_index, 'ssc'], pnbl_indent):
            _trace("identifying current as ps15 as possibly followed by speaker")
            _trace(df.at[index, 'line_no'], df.at[index, 'data'])
            df.at[index, status_col] = 'ps15'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart7'

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
def start_top_identifications_part8(df):
    """Pass 8 of the neighbour-based narrowing of ``Identification_Status``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first.  The pass only looks at rows that are not yet
    identified, whose two best candidates are ``ps4`` and ``ps6`` (in
    either order) and whose previous neighbour is exactly ``ps1``.
    Neighbours are the adjacent row when ``plb`` / ``nlb`` is ``'N'``,
    otherwise the row two positions away (``['blank']`` at the edges).

    * next neighbour exactly ``ps6`` and ``nlb == 'N'`` -> ``ps4``;
    * otherwise, next neighbour without ``ps5`` among its candidates and
      not starting with ``ps6`` -> ``ps6``.

    Finally ``isIdentified`` is refreshed for every row.

    Fixes relative to the previous implementation:

    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``
      and skips rows whose status is missing instead of crashing;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * unused locals and leftover debug dumps were removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0, with columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``,
            ``nlb``, ``line_no`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting (in particular which ``if`` the ``elif``
    # belongs to) was reconstructed from a listing that had lost its
    # indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'
    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    def _trace(*parts):
        # Best-effort console trace; the original guarded these prints with
        # try/except (presumably against console encoding errors).
        try:
            print(*parts)
        except Exception:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        # Previous neighbour: adjacent row, or the one before it.
        if index == 0 or (plb != 'N' and index - 1 == 0):
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 1 if plb == 'N' else index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next neighbour: adjacent row, or the one after it.
        if index == last_label or (nlb != 'N' and index + 1 == last_label):
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 1 if nlb == 'N' else index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")

        cur_line_pos = status.split(";")
        if tuple(cur_line_pos[:2]) not in (('ps4', 'ps6'), ('ps6', 'ps4')):
            continue
        if "".join(pnbl_pos) != 'ps1':
            continue

        if "".join(nnbl_pos) == 'ps6' and nlb == 'N':
            chosen = 'ps4'
            _trace("identifying current as ps4 ")
        elif 'ps5' not in nnbl_pos and nnbl_pos[0] != 'ps6':
            chosen = 'ps6'
            _trace("identifying current as ps6 as possibly between slug and speaker")
        else:
            continue

        _trace(df.at[index, 'line_no'], df.at[index, 'data'])
        df.at[index, status_col] = chosen
        df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart8'

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
#1.1
|
||
def decrease_wt_dial_between_action(df):
    """Re-weight and re-rank candidates of rows that directly follow ``ps4``.

    ``Identification_Status`` holds ``;``-separated candidate codes, best
    candidate first; the per-candidate weight of a row is stored in the
    column named after the code (``ps5``, ``ps14`` ...).  For every row
    that is not yet identified and whose previous neighbour (the adjacent
    row when ``plb`` is ``'N'``, otherwise the row two positions up) starts
    with ``ps4``:

    * if ``ps14`` is one of the row's two best candidates, its weight is
      lowered by 5;
    * if the next neighbour starts with ``ps6``, the weight of ``ps5`` is
      raised by 11.

    When a weight changed, the row's candidates are re-sorted by weight
    (descending, ties keep their order) and written back to
    ``Identification_Status`` and, as ``code-weight`` pairs, to
    ``Identification_Status_with_weights``.  Candidates that have no weight
    column are dropped from the list, as before.  Finally ``isIdentified``
    is refreshed for every row.

    Fixes relative to the previous implementation:

    * sorting parsed the weight back out of the ``code-weight`` text with
      ``split("-")[1]``, which raised ``ValueError`` as soon as a weight
      became negative (``"ps14--2"``) or was not numeric; weights are now
      kept as numbers while sorting and unparsable ones sort last;
    * the closing loop used ``==`` (a comparison with no effect) instead of
      assigning ``isIdentified``; it now really writes ``'Yes'`` / ``'No'``
      and skips rows whose status is missing;
    * writes go through ``df.at`` instead of chained indexing, which pandas
      does not guarantee to write back to ``df``;
    * unused locals and most debug dumps were removed.

    Args:
        df: DataFrame whose index is assumed to be consecutive integers
            starting at 0, with columns ``isIdentified``,
            ``Identification_Status``,
            ``Identification_Status_with_weights``, ``plb``, ``nlb`` and
            one weight column per candidate code.

    Returns:
        The same DataFrame, modified in place.
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the canonical file.
    status_col = 'Identification_Status'
    if len(df.index) == 0:
        return df
    last_label = df.index[-1]

    def _trace(*parts):
        # Best-effort console trace (console encoding may reject some text).
        try:
            print(*parts)
        except Exception:
            pass

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        # Previous neighbour: adjacent row, or the one before it.
        if index == 0 or (plb != 'N' and index - 1 == 0):
            pnbl_pos = ['blank']
        else:
            pnbl_index = index - 1 if plb == 'N' else index - 2
            pnbl_pos = df.at[pnbl_index, status_col].split(";")

        # Next neighbour: adjacent row, or the one after it.
        if index == last_label or (nlb != 'N' and index + 1 == last_label):
            nnbl_pos = ['blank']
        else:
            nnbl_index = index + 1 if nlb == 'N' else index + 2
            nnbl_pos = df.at[nnbl_index, status_col].split(";")

        cur_line_pos = status.split(";")
        _trace("checking dialogue between action", index, pnbl_pos, cur_line_pos, nnbl_pos)

        if pnbl_pos[0] != 'ps4':
            continue  # both adjustments require ps4 right before this row

        wt_changed = False
        ## previous top is ps4 and ps14 is in current top 2: decrease weight of ps14 by 5
        if 'ps14' in cur_line_pos[:2]:
            # Weights are written back as text, exactly as before.
            df.at[index, 'ps14'] = str(int(df.at[index, 'ps14']) - 5)
            wt_changed = True

        ## previous top is ps4 and next top1 is ps6: increase weight of ps5 by 11
        if nnbl_pos[0] == 'ps6':
            df.at[index, 'ps5'] = str(int(df.at[index, 'ps5']) + 11)
            wt_changed = True

        if not wt_changed:
            continue

        # Collect (code, numeric weight, weight text) for each candidate.
        weighted = []
        for pos in cur_line_pos:
            if pos not in df.columns:
                continue  # no weight column for this candidate
            raw = df.at[index, pos]
            try:
                weight = int(raw)
                label = str(weight)
            except (TypeError, ValueError):
                # Unparsable weight: keep its text, rank it last.
                weight = float('-inf')
                label = str(raw)
            weighted.append((pos, weight, label))

        # Stable sort, highest weight first.
        weighted.sort(key=lambda item: item[1], reverse=True)

        df.at[index, 'Identification_Status_with_weights'] = ';'.join(
            pos + '-' + label for pos, _, label in weighted)

        ## copy over to identification status without the weights but in order of decreasing weights
        line_pos_string = ';'.join(pos for pos, _, _ in weighted)
        _trace(line_pos_string)
        df.at[index, status_col] = line_pos_string

    # A row counts as identified exactly when one candidate is left.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.at[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
|
||
|
||
|
||
def examine_among_two(df):
|
||
|
||
for index in df.index:
|
||
|
||
if df['isIdentified'][index] == 'Yes' or pd.isna(df['Identification_Status'][index]):
|
||
continue
|
||
|
||
|
||
|
||
pnbl_pos = []
|
||
nnbl_pos = []
|
||
pnbl_index = -1
|
||
prev_flag = False
|
||
next_flag = False
|
||
pnbl_index = index -1
|
||
nnbl_index = index +1
|
||
|
||
if index == 0:
|
||
pnbl_pos = ['blank']
|
||
pnbl_index = 'first'
|
||
elif df['plb'][index] == 'N' :
|
||
print(pnbl_pos)
|
||
|
||
pnbl_pos = df['Identification_Status'][index-1].split(";")
|
||
pnbl_index = index -1
|
||
|
||
elif index - 1 == 0:
|
||
pnpl_pos = ['blank']
|
||
pnbl_index = 'first'
|
||
else:
|
||
pnbl_pos = df['Identification_Status'][index-2].split(";")
|
||
pnbl_index = index -2
|
||
|
||
if index == df.index[-1]:
|
||
nnbl_pos = ['blank']
|
||
nnbl_index = 'last'
|
||
elif df['nlb'][index] == 'N' :
|
||
nnbl_pos = df['Identification_Status'][index+1].split(";")
|
||
nnbl_par = df['parenthetical'][index+1]
|
||
nnbl_index = index + 1
|
||
elif index+1 == df.index[-1]:
|
||
nnbl_pos = ['blank']
|
||
nnbl_index = 'last'
|
||
else:
|
||
nnbl_pos = df['Identification_Status'][index+2].split(";")
|
||
nnbl_par = df['parenthetical'][index+2]
|
||
nnbl_index = index + 2
|
||
|
||
cur_indent = df['ssc'][index]
|
||
try:
|
||
pnbl_indent = df['ssc'][pnbl_index]
|
||
except:
|
||
pnbl_indent = -1
|
||
try:
|
||
nnbl_indent = df['ssc'][nnbl_index]
|
||
nnbl_case = df['case'][nnbl_index]
|
||
except:
|
||
nnbl_indent = -1
|
||
|
||
|
||
# try:
|
||
# if df['prvious_line_blank'][pnbl_index] == 'N' :
|
||
# ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";")
|
||
# else:
|
||
# ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";")
|
||
# except:
|
||
# pass
|
||
|
||
|
||
line_no = df['line_no'][index]
|
||
data = df['data'][index]
|
||
cur_line_pos = df['Identification_Status'][index].split(";")
|
||
pnbl_par = df['parenthetical'][pnbl_index] if pnbl_index != 'first' else False
|
||
cur_line_par = df['parenthetical'][index]
|
||
cur_line_case = df['case'][index]
|
||
|
||
try:
|
||
pnbl_top2 = pnbl_pos[1]
|
||
|
||
except:
|
||
pnbl_top2 = ''
|
||
|
||
# try:
|
||
# nnbl_top2 = nnbl_pos[1]
|
||
|
||
# except:
|
||
# nnbl_top2 = ''
|
||
|
||
#cur_indent = df['data_begins/Space count'][index]
|
||
#print(cur_indent)
|
||
|
||
#print("examining")
|
||
|
||
#print(line_no,cur_indent,data)
|
||
#print(cur_line_pos)
|
||
# print(pnbl_pos)
|
||
# print(cur_line_pos)
|
||
# print(nnbl_pos)
|
||
line_new_pos = []
|
||
|
||
print("\n")
|
||
|
||
cur_line_pos = df['Identification_Status'][index].split(";")
|
||
top1 = cur_line_pos[0]
|
||
top2 = top1
|
||
top3 = top1
|
||
top4 = top1
|
||
top5 = top1
|
||
if len(cur_line_pos) == 5:
|
||
top5 = cur_line_pos[4]
|
||
if len(cur_line_pos) >= 4:
|
||
top4 = cur_line_pos[3]
|
||
|
||
if len(cur_line_pos) >= 3:
|
||
top3 = cur_line_pos[2]
|
||
if len(cur_line_pos) >= 2:
|
||
top2 = cur_line_pos[1]
|
||
|
||
lcp = df['lcp'][index]
|
||
|
||
|
||
if len(pnbl_pos) == 0:
|
||
pnbl_pos = ['blank']
|
||
|
||
|
||
pnbl_top1 = pnbl_pos[0]
|
||
pnbl_top2 = pnbl_top1
|
||
pnbl_top3 = pnbl_top1
|
||
pnbl_top4 = pnbl_top1
|
||
pnbl_top5 = pnbl_top1
|
||
if len(pnbl_pos) == 5:
|
||
pnbl_top5 = pnbl_pos[4]
|
||
if len(pnbl_pos) >= 4:
|
||
pnbl_top4 = pnbl_pos[3]
|
||
print(pnbl_pos[3])
|
||
|
||
if len(pnbl_pos) >= 3:
|
||
pnbl_top3 = pnbl_pos[2]
|
||
if len(pnbl_pos) >= 2:
|
||
pnbl_top2 = pnbl_pos[1]
|
||
|
||
# lcp = df['last_character_placement'][index]
|
||
|
||
|
||
nnbl_top1 = nnbl_pos[0]
|
||
nnbl_top2 = nnbl_top1
|
||
nnbl_top3 = nnbl_top1
|
||
# nnbl_top4 = nnbl_top1
|
||
# nnbl_top5 = nnbl_top1
|
||
|
||
if len(nnbl_pos) >= 3:
|
||
nnbl_top3 = nnbl_pos[2]
|
||
if len(nnbl_pos) >= 2:
|
||
nnbl_top2 = nnbl_pos[1]
|
||
|
||
try:
|
||
print(index,data,cur_line_case,top1,top2,nnbl_case,nnbl_top1)
|
||
except:
|
||
pass
|
||
|
||
|
||
nnbl_new_data = ''
|
||
before = ''
|
||
## if 6 and 22 left
|
||
## split after full stop to new line.. merge with subsequesnt transiton
|
||
if len(cur_line_pos) == 2 and (top1 == 'ps6' or top2 == 'ps22') or (top2 == 'ps6' or top1 == 'ps22'):
|
||
if re.search(".",data):
|
||
print("found full stop,separating")
|
||
parts = data.split(".")
|
||
last = parts[-1]
|
||
try:
|
||
print(last)
|
||
except:
|
||
pass
|
||
before = parts[0:-1]
|
||
print(" ".join(before))
|
||
print(nnbl_indent)
|
||
if len(last.split()) == 1:
|
||
print("single word after full stop")
|
||
if "".join(nnbl_pos) == 'ps16':
|
||
print("next is transition , merging")
|
||
nnbl_data = df['data'][nnbl_index]
|
||
try:
|
||
print(nnbl_data)
|
||
except:
|
||
pass
|
||
nnbl_new_data = last.strip() + ' ' + nnbl_data.strip()
|
||
try:
|
||
print(nnbl_new_data)
|
||
except:
|
||
pass
|
||
|
||
nnbl_new_data = nnbl_new_data.rjust(len(nnbl_new_data) + int(nnbl_indent))
|
||
df['data'][nnbl_index] = nnbl_new_data
|
||
|
||
|
||
print("Splitting current and Identifying current action end")
|
||
df['data'][index] = " ".join(before)
|
||
df['Identification_Status'][index] = 'ps6'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
continue
|
||
|
||
|
||
## if 7 and 8.. make 7 if no parenthtical
|
||
if len(cur_line_pos) == 2 and ((top1 == 'ps7' and top2 == 'ps8') or (top1 == 'ps8' and top2 == 'ps7')):
|
||
if df['parenthetical'][index] == 'Absent':
|
||
try:
|
||
print("Identifying as speaker as no parenthtical",data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps7'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
continue
|
||
|
||
## if 1/2 and 30 left keep 1 if lcp <
|
||
if len(cur_line_pos) == 2 and (( (top1 == 'ps1' or top1 == 'ps2') and top2 == 'ps30') or (top1 == 'ps30' and top2 == 'ps1')):
|
||
print(index,cur_line_case,nnbl_case,nnbl_top1)
|
||
if df['lcp'][index] <= 63:
|
||
try:
|
||
print("Identifying as slugline",data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps1'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
elif cur_line_case == 'AllUpper' and nnbl_case == 'AllUpper' and (nnbl_top1 == 'ps1' or nnbl_top1 == 'ps3'):
|
||
try:
|
||
print("Identifying as slugline beginning",data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps2'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
try:
|
||
print("Identifying as slugline end",df['data'][nnbl_index])
|
||
except:
|
||
pass
|
||
df['Identification_Status'][nnbl_index] = 'ps3'
|
||
df['When_Identified'][nnbl_index] = 'ExamineLastTwo'
|
||
continue
|
||
|
||
## if 15 and 29 left keep 15 if lcp <
|
||
if len(cur_line_pos) == 2 and ((top1 == 'ps15' and top2 == 'ps29') or (top1 == 'ps29' and top2 == 'ps15')):
|
||
if df['lcp'][index] <= 51:
|
||
try:
|
||
print("Identifying as dialogue ending",data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps15'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
continue
|
||
|
||
if len(cur_line_pos) == 2 and ((top1 == 'ps13' and top2 == 'ps9') or (top1 == 'ps9' and top2 == 'ps13')) :
|
||
if cur_line_par == 'Absent':
|
||
try:
|
||
print("Identifying as dialogue beginning",data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps13'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
continue
|
||
|
||
if len(cur_line_pos) == 2:
|
||
if cur_line_pos[0] == 'ps1' and cur_line_pos[1] == 'ps17':
|
||
wt1 = int(df['ps1'][index])
|
||
wt17 = int(df['ps17'][index])
|
||
if wt1 - wt17 > 20:
|
||
print("identifying current as ps1 ")
|
||
try:
|
||
print(line_no,data)
|
||
except:
|
||
pass
|
||
df['Identification_Status'][index] = 'ps1'
|
||
df['When_Identified'][index] = 'ExamineLastTwo'
|
||
continue
|
||
### remove ps7 ,8 if in stopwords
|
||
elif cur_line_pos[0] == 'ps7':
|
||
line_new_pos = cur_line_pos
|
||
print("Checking stop words")
|
||
skip_words = ['ON THE SCREEN','ON THE TV','MORNING','AT HOTEL','TV','MONTAGES','MUSICAL MONTAGES','ESSENTIALS','LATER','ESSENTIAL']
|
||
search_data = data.replace(":","")
|
||
found_match = False
|
||
for word in skip_words:
|
||
if re.match(word,search_data.strip()):
|
||
found_match = True
|
||
break
|
||
if found_match:
|
||
try:
|
||
line_new_pos.remove('ps7')
|
||
line_new_pos.remove('ps8')
|
||
print("ps7,ps8 removed")
|
||
df['Identification_Status'][index] = ";".join(line_new_pos)
|
||
df['When_Identified'][index] = 'ExamineSpeakerSkipWords'
|
||
continue
|
||
|
||
except:
|
||
print("Could not remove speaker pos")
|
||
|
||
### remove ps3 if pnbl top 2 does not have ps2
|
||
else:
|
||
line_new_pos = cur_line_pos
|
||
print("Checking sluglineend")
|
||
if not (pnbl_top1 == 'ps2' or pnbl_top2 == 'ps2') and cur_line_pos[0] == 'ps3':
|
||
line_new_pos.remove('ps3')
|
||
print("ps3 removed")
|
||
df['Identification_Status'][index] = ";".join(line_new_pos)
|
||
df['When_Identified'][index] = 'ExamineSluglineEnd'
|
||
continue
|
||
|
||
|
||
for index in df.index:
|
||
#print(index)
|
||
cur_line_pos = df['Identification_Status'][index].split(";")
|
||
if len(cur_line_pos) != 1 :
|
||
df['isIdentified'][index] == 'No'
|
||
else:
|
||
df['isIdentified'][index] == 'Yes'
|
||
return df
|
||
|
||
|
||
|
||
def examine_action_using_top2_wt_diff(df):
    """Settle 'ps5'/'ps6' lines whose best candidate clearly beats the runner-up.

    For every row except the first and the last, and only when
    ``isIdentified`` is not ``'Yes'``, the ``;``-separated candidates in
    ``Identification_Status`` are inspected:

    * top candidate ``'ps6'``: the row is fixed to ``'ps6'`` when its indent
      (``ssc``) is below 25, the next non-blank line is exactly ``'ps1'`` and
      the top candidate's weight exceeds the runner-up's by more than 15.
    * top candidate ``'ps5'`` on a row whose ``case`` is not ``'AllUpper'``:
      the row is fixed to ``'ps5'`` when both non-blank neighbours share its
      indent, neither neighbour is ``'AllUpper'``, the previous neighbour is
      ``'ps4'``/``'ps5'`` or the next one is ``'ps6'``/``'ps5'``, and the
      weight gap is greater than 10.

    The weight of a candidate is read from the column named after it
    (``df[<candidate>]``).  Rows that get fixed also receive
    ``When_Identified = 'ExamineActionUsingTop2Wt'``.

    Args:
        df: DataFrame with columns ``isIdentified``, ``Identification_Status``,
            ``plb``, ``nlb``, ``ssc``, ``case``, ``When_Identified`` and one
            weight column per candidate code.  Row labels are assumed to be
            consecutive integers starting at 0 (neighbours are addressed as
            ``index - 1`` / ``index + 2`` and the first row is label 0).

    Returns:
        The same DataFrame object, modified in place.
    """
    if len(df.index) < 3:
        # No interior rows, so there is nothing to examine.
        return df

    first_label = 0
    last_label = df.index[-1]

    def nearest_non_blank(index, step):
        """Return (candidates, indent, case) of the nearest non-blank line.

        ``step`` is -1 for the previous line and +1 for the next one.  When
        no such line exists (edge of the script) the result is
        ``(['blank'], -1, '')`` so that indent comparisons can never match,
        matching how the sibling refinement pass treats its 'first'/'last'
        sentinels.
        """
        blank_flag = 'plb' if step < 0 else 'nlb'
        edge_label = first_label if step < 0 else last_label
        if index == edge_label:
            return ['blank'], -1, ''
        if df.loc[index, blank_flag] == 'N':
            target = index + step
        elif index + step == edge_label:
            # The only line on that side is a blank boundary line.
            return ['blank'], -1, ''
        else:
            # Hop over the single blank line.
            target = index + 2 * step
        return (
            df.loc[target, 'Identification_Status'].split(";"),
            df.loc[target, 'ssc'],
            df.loc[target, 'case'],
        )

    for index in df.index[1:-1]:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        candidates = df.loc[index, 'Identification_Status'].split(";")
        top1 = candidates[0]
        if top1 not in ('ps5', 'ps6'):
            # Only these two candidates have a rule below; skipping here also
            # avoids looking up weight columns for codes such as 'blank'.
            continue
        top2 = candidates[1] if len(candidates) >= 2 else top1

        cur_indent = df.loc[index, 'ssc']
        cur_case = df.loc[index, 'case']
        pnbl_pos, pnbl_indent, pnbl_case = nearest_non_blank(index, -1)
        nnbl_pos, nnbl_indent, nnbl_case = nearest_non_blank(index, +1)
        pnbl_status = "".join(pnbl_pos)
        nnbl_status = "".join(nnbl_pos)

        top2_wt_diff = df.loc[index, top1] - df.loc[index, top2]

        if top1 == 'ps6':
            print("top 2 wt diff", top2_wt_diff)
            if cur_indent < 25 and nnbl_status == 'ps1' and top2_wt_diff > 15:
                print("identifying as ps6")
                df.loc[index, 'Identification_Status'] = 'ps6'
                df.loc[index, 'When_Identified'] = 'ExamineActionUsingTop2Wt'

        elif cur_case != 'AllUpper':
            # top1 == 'ps5'
            print("top 2 wt diff", top2_wt_diff)
            same_indent = pnbl_indent == cur_indent and cur_indent == nnbl_indent
            action_neighbour = (
                pnbl_status in ('ps4', 'ps5') or nnbl_status in ('ps6', 'ps5')
            )
            if (same_indent and action_neighbour and top2_wt_diff > 10
                    and pnbl_case != 'AllUpper' and nnbl_case != 'AllUpper'):
                print("identifying as ps5")
                df.loc[index, 'Identification_Status'] = 'ps5'
                df.loc[index, 'When_Identified'] = 'ExamineActionUsingTop2Wt'

    return df
|
||
|
||
|
||
|
||
def identify_top_as_final(df):
    """Collapse every still-ambiguous row to a single identification code.

    Rows whose ``isIdentified`` is ``'Yes'`` are left alone.  For the others
    the first entry of the ``;``-separated ``Identification_Status`` is taken
    as final, with one exception: when that first entry is ``'ps1'`` or
    ``'ps2'`` but the first eight characters of the stripped ``data`` contain
    none of the markers ``INT.``, ``EXT.``, ``I/E``, ``E/I``, ``EXT-``,
    ``INT-``, the second entry is used instead (or the first again when
    there is no second entry).

    The markers are matched as literal text; they were previously handed to
    ``re.search`` unescaped, so the ``.`` acted as a wildcard and text such as
    "INTO THE ..." was mistaken for a slugline marker.

    Args:
        df: DataFrame with columns ``Identification_Status``, ``isIdentified``
            and ``data``.

    Returns:
        The same DataFrame object, modified in place.
    """
    slug_markers = ['INT.', 'EXT.', 'I/E', 'E/I', 'EXT-', 'INT-']

    # take the top possibility as final
    for index in df.index:
        if df.loc[index, 'isIdentified'] == 'Yes':
            continue

        ranked = df.loc[index, 'Identification_Status'].split(";")
        top1 = ranked[0]
        top2 = ranked[1] if len(ranked) >= 2 else top1

        line_head = df.loc[index, 'data'].strip()[0:8]
        contains_slug_words = False
        for marker in slug_markers:
            print(marker)
            if marker in line_head:
                contains_slug_words = True
                break

        if top1 in ('ps1', 'ps2') and not contains_slug_words:
            # A slugline candidate without a slugline marker: trust the
            # runner-up instead.
            df.loc[index, 'Identification_Status'] = top2
            continue

        df.loc[index, 'Identification_Status'] = top1

    return df
|
||
|
||
|
||
def run_audit_on_identified_backup(df, audit_df):
    """Re-format every identified line and log each correction (backup variant).

    Each non-blank row of ``df`` is looked up by its ``Identification_Status``
    in a table of formatting rules.  A rule fixes, in this order:

    1. the horizontal position (left indent ``ssc``; for ``'ps16'`` the last
       character position ``lcp`` instead),
    2. the letter case (``'AllUpper'`` / ``'AllLower'``) where the rule has one,
    3. the blank line before the row (insert one or delete one, driven by the
       row's ``plb`` flag),
    4. the blank line after the row (insert one or delete one, driven by the
       row's ``nlb`` flag).

    ``'ps1'`` rows additionally lose digits at the start of their text.
    Inserted blank rows get a fractional row label (``index +/- 0.25``) and a
    ``line_no`` halfway between their neighbours; at the end the frame is
    sorted by label and re-indexed.

    Every correction is noted in ``audit_df`` (indexed by ``line_no``) when
    the relevant row and column exist there; inserted blank lines get a new
    ``audit_df`` row.

    A failure while auditing one row (for example a missing neighbour at the
    edge of the script) abandons the rest of that row's audit, is printed,
    and processing continues with the next row.

    Args:
        df: line table with columns ``line_no``, ``data``,
            ``Identification_Status``, ``case``, ``ssc``, ``lcp``, ``plb``,
            ``nlb`` (plus ``pnbl_line_no`` and ``isIdentified`` for ``'ps6'``
            rows).  Row labels are assumed to be consecutive integers --
            neighbours are addressed as ``index - 1`` / ``index + 1``.
        audit_df: audit table indexed by ``line_no``; modified in place.

    Returns:
        A new DataFrame: ``df`` sorted by row label with a fresh 0..n-1 index.
        ``df`` itself is also modified in place.
    """
    prev_blank = "previous line already blank"
    next_blank = "next line already blank"
    next_not_blank = "next line not blank"

    # status -> (title, target position, target case,
    #            before-action, message when no before-action is needed,
    #            after-action, message when no after-action is needed)
    # The target position is the left indent, except for 'ps16' where it is
    # the last character position.
    rules = {
        'ps1': ("Auditing Slugline", 15, 'AllUpper',
                'insert', prev_blank, 'insert', next_blank),
        'ps4': ("Auditing Action Beginning", 15, None,
                'insert', prev_blank, 'delete', next_not_blank),
        'ps5': ("Auditing Action Middle", 15, None,
                'delete', "previous line already non blank", 'delete', next_not_blank),
        'ps6': ("Auditing Action Ending", 15, None,
                'action_end', None, 'insert', next_blank),
        'ps7': ("Auditing Speaker", 35, 'AllUpper',
                'insert', prev_blank, 'delete', next_not_blank),
        'ps10': ("Auditing Parenthetical complete", 30, 'AllLower',
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps11': ("Auditing Parenthetical beginning", 30, 'AllLower',
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps20': ("Auditing Parenthetical middle", 30, 'AllLower',
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps12': ("Auditing Parenthetical end", 30, 'AllLower',
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps13': ("Auditing Dialogue Beginning", 25, None,
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps14': ("Auditing Dialogue Middle", 25, None,
                 'delete', prev_blank, 'delete', next_not_blank),
        'ps15': ("Auditing Dialogue End", 25, None,
                 'delete', "previous line already not blank", 'insert', next_blank),
        'ps16': ("Auditing Transition", 72, 'AllUpper',
                 'insert', prev_blank, 'insert', next_blank),
        'ps17': ("Auditing Special Term", 15, 'AllUpper',
                 'insert', prev_blank, 'insert', next_blank),
    }

    def record(line_no, column, value):
        """Note a correction in audit_df; unknown rows/columns are ignored
        so the audit table is never grown by accident."""
        if column in audit_df.columns and line_no in audit_df.index:
            audit_df.loc[line_no, column] = value

    def add_blank_row(label, line_no, case):
        """Append a blank line to df under the fractional label ``label``."""
        df.loc[label] = np.nan
        for column, value in (('Identification_Status', 'blank'),
                              ('case', case),
                              ('plb', 'N'),
                              ('nlb', 'N'),
                              ('line_no', line_no)):
            df.loc[label, column] = value

    def correct_case(index, new_case):
        """Upper/lower-case the text of row ``index`` and log it."""
        line_no = df.loc[index, 'line_no']
        print("correcting case to", new_case)
        if new_case == 'AllUpper':
            df.loc[index, 'data'] = df.loc[index, 'data'].upper()
        elif new_case == 'AllLower':
            df.loc[index, 'data'] = df.loc[index, 'data'].lower()
        df.loc[index, 'case'] = new_case
        record(line_no, 'case_corrected', 'Corrected to ' + str(new_case))

    def correct_left_indent(index, new_indent):
        """Re-pad the text so it starts at column ``new_indent``."""
        line_no = df.loc[index, 'line_no']
        text = df.loc[index, 'data'].strip()
        print("Correcting left indent to", new_indent)
        df.loc[index, 'data'] = text.rjust(len(text) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_indent + len(text) - 1
        record(line_no, 'left_indent_corrected',
               'Left indent Corrected to ' + str(new_indent))

    def correct_right_indent(index, new_lcp):
        """Re-pad the text so its last character sits at column ``new_lcp``."""
        line_no = df.loc[index, 'line_no']
        text = df.loc[index, 'data'].strip()
        print("Correcting right indent to", 83 - new_lcp - 1)
        new_indent = new_lcp - len(text) + 1
        df.loc[index, 'data'] = text.rjust(len(text) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_lcp
        record(line_no, 'right_indent_corrected',
               'Right indent Corrected to ' + str(83 - new_lcp - 1))

    def delete_line_after(index):
        """Drop row ``index + 1`` and log the removal."""
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index + 1, 'line_no']
        df.drop(index + 1, inplace=True)
        print("line deleted after", line_no)
        print("line no deleted ", removed_line_no)
        record(line_no, 'blank_deleted_after', 'Yes')
        record(removed_line_no, 'line_removed', 'Yes')

    def delete_line_before(index):
        """Drop row ``index - 1`` and log the removal."""
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index - 1, 'line_no']
        df.drop(index - 1, inplace=True)
        print("line deleted before", line_no)
        record(line_no, 'blank_deleted_before', 'Yes')
        record(removed_line_no, 'line_removed', 'Yes')

    def insert_line_after(index):
        """Insert a blank row between ``index`` and ``index + 1``."""
        line_no = df.loc[index, 'line_no']
        next_line_no = df.loc[index + 1, 'line_no']
        new_line_no = (line_no + next_line_no) / 2
        if new_line_no in audit_df.index:
            # Midpoint already taken by an earlier insertion; move closer
            # to the following line.
            new_line_no = (new_line_no + next_line_no) / 2
        print("inserted blank line after ", line_no)
        add_blank_row(index + 0.25, new_line_no, '')
        df.loc[index + 1, 'plb'] = 'Y'

        record(line_no, 'blank_inserted_after', 'Yes')
        audit_df.loc[new_line_no] = 'No'
        record(new_line_no, 'data', '')
        record(new_line_no, 'data_corrected', '')
        record(new_line_no, 'line_removed', 'No')
        print("line inserted after ", line_no)

    def insert_line_before(index):
        """Insert a blank row between ``index - 1`` and ``index``."""
        line_no = df.loc[index, 'line_no']
        pvs_line_no = df.loc[index - 1, 'line_no']
        new_line_no = (line_no + pvs_line_no) / 2
        if new_line_no in audit_df.index:
            new_line_no = (new_line_no + line_no) / 2
        print("inserted blank line before", line_no)
        add_blank_row(index - 0.25, new_line_no, 'None')
        df.loc[index - 1, 'nlb'] = 'Y'

        record(line_no, 'blank_inserted_before', 'Yes')
        audit_df.loc[new_line_no] = ''
        record(new_line_no, 'line_removed', 'No')

    def check_and_remove_numbers(index):
        """Strip digits from the start of the text (ignoring leading spaces).

        Only the leading digits are removed; the same digits appearing later
        in the line are kept.
        """
        text = df.loc[index, 'data']
        scene_num = ''
        while True:
            stripped = text.lstrip()
            if not re.match(r'\d', stripped):
                break
            start = len(text) - len(stripped)
            scene_num += text[start]
            text = text[:start] + text[start + 1:]
            df.loc[index, 'data'] = text
            print(text)
        print("scene num", scene_num)

    def fix_line_before_action_end(index, plb):
        """'ps6' rule for the preceding line.

        Directly after 'ps4'/'ps5' no blank line is allowed; otherwise a
        blank line is inserted when the previous non-blank line is already
        identified.
        """
        if df.loc[index - 1, 'Identification_Status'] in ('ps4', 'ps5'):
            if plb == 'Y':
                delete_line_before(index)
                df.loc[index, 'plb'] = 'N'
            else:
                print("previous line already non blank")
            return

        ## later move this to insert line before
        pnbl_line_no = df.loc[index, 'pnbl_line_no']
        try:
            flags = df.loc[df['line_no'] == pnbl_line_no, 'isIdentified']
            pnbl_identified = bool((flags == 'Yes').any())
        except KeyError:
            pnbl_identified = False

        if plb == 'N' and pnbl_identified:
            insert_line_before(index)
            df.loc[index, 'plb'] = 'Y'
        else:
            print("previous line already blank")

    def audit_line(index, status, rule):
        """Apply one formatting rule to row ``index``."""
        title, position, case, before, before_ok, after, after_ok = rule
        print(title)
        cur_case = df.loc[index, 'case']

        if status == 'ps1':
            try:
                print(df.loc[index, 'data'])
            except UnicodeError:
                # Console cannot encode the text; the echo is only a trace.
                pass

        # 1. horizontal position
        if status == 'ps16':
            if df.loc[index, 'lcp'] != position:
                correct_right_indent(index, position)
            else:
                print("indent already", position)
            print(cur_case, "123")
        elif df.loc[index, 'ssc'] != position:
            correct_left_indent(index, position)
        else:
            print("indent already", position)

        # 2. case
        if case is not None:
            if cur_case != case:
                correct_case(index, case)
            else:
                print("Case already", case)

        ## plb nlb
        plb = df.loc[index, 'plb']
        nlb = df.loc[index, 'nlb']

        # 3. line before
        if before == 'insert':
            if plb == 'N':
                # A slugline on the very first row has nothing before it.
                if not (status == 'ps1' and index == 0):
                    insert_line_before(index)
                    df.loc[index, 'plb'] = 'Y'
            else:
                print(before_ok)
        elif before == 'delete':
            if plb == 'Y':
                delete_line_before(index)
                df.loc[index, 'plb'] = 'N'
            else:
                print(before_ok)
        else:
            fix_line_before_action_end(index, plb)

        # 4. line after
        if after == 'insert':
            if nlb == 'N':
                if (status == 'ps15'
                        and df.loc[index + 1, 'Identification_Status'] == 'ps10'):
                    print("not inserting blank as next is parenthtical")
                else:
                    insert_line_after(index)
                    df.loc[index, 'nlb'] = 'Y'
            else:
                print(after_ok)
        elif nlb == 'Y':
            delete_line_after(index)
            df.loc[index, 'nlb'] = 'N'
        else:
            print(after_ok)

        if status == 'ps1':
            ## remove numbers if found at start
            check_and_remove_numbers(index)

    # Iterate over a snapshot of the labels: rows are added and dropped
    # while the audit runs.
    for index in list(df.index):
        if index not in df.index:
            # Row was removed by an earlier delete_line_after.
            continue

        status = df.loc[index, 'Identification_Status']
        if status == 'blank':
            continue

        line_no = df.loc[index, 'line_no']
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(status)

        rule = rules.get(status)
        if rule is None:
            continue

        try:
            audit_line(index, status, rule)
        except Exception as exc:
            # Best effort: keep whatever was corrected so far for this row
            # and carry on, but say what went wrong.
            print("audit failed for line", line_no, ":", repr(exc))

    df = df.sort_index().reset_index(drop=True)

    #df = df.sort_values(by=['line_no']).reset_index(drop =True)

    return df
|
||
|
||
|
||
def run_audit_on_identified(df, audit_df=False):
    """Apply the screenplay layout rules to every identified line of ``df``.

    Each non-blank row is dispatched on its ``Identification_Status`` value
    (``ps1``, ``ps4`` ... ``ps20``) to an auditor that corrects the left or
    right indent, the letter case, and the presence of a blank line before
    and after the row.  Rows whose status has no auditor are left untouched.

    Args:
        df: Line-level frame, modified in place (rows are inserted and
            dropped).  The columns ``line_no``, ``data``,
            ``Identification_Status``, ``case``, ``ssc``, ``lcp``, ``plb``
            and ``nlb`` are read and written; the ``ps6`` rule additionally
            reads ``pnbl_line_no`` and ``isIdentified``.  Neighbouring rows
            are addressed as ``index - 1`` / ``index + 1``, so the index is
            expected to hold consecutive integers.
        audit_df: Optional frame indexed by line number in which every
            correction is logged.  ``False`` (the default) or an empty
            frame disables logging.

    Returns:
        ``df`` sorted by index and renumbered from zero.  When a non-empty
        ``audit_df`` was supplied the result is the tuple ``(df, audit_df)``.
    """
    has_audit = isinstance(audit_df, pd.DataFrame) and not audit_df.empty

    # ------------------------------------------------------------------
    # audit-log helpers
    # ------------------------------------------------------------------
    def record(line_no, column, message):
        """Write ``message`` into the audit log if that cell exists."""
        if has_audit and column in audit_df.columns and line_no in audit_df.index:
            audit_df.loc[line_no, column] = message

    def free_line_no(candidate, toward):
        """Move ``candidate`` towards ``toward`` until the audit log has no such line."""
        if has_audit:
            # Bounded: repeated halving converges on ``toward``, which is
            # normally present in the log, so an open loop could never end.
            for _ in range(40):
                if candidate not in audit_df.index:
                    break
                candidate = (candidate + toward) / 2
        return candidate

    def register_blank(new_line_no, defaults):
        """Create the audit-log row that belongs to a freshly inserted blank line."""
        if not has_audit:
            return
        audit_df.loc[new_line_no] = np.nan
        for column, value in defaults:
            if column in audit_df.columns:
                audit_df.loc[new_line_no, column] = value

    # ------------------------------------------------------------------
    # primitive corrections
    # ------------------------------------------------------------------
    def correct_case(index, new_case):
        """Upper/lower-case the text of row ``index`` and store the new case label."""
        line_no = df.loc[index, 'line_no']
        print("correcting case to", new_case)
        if new_case == 'AllUpper':
            df.loc[index, 'data'] = df.loc[index, 'data'].upper()
        elif new_case == 'AllLower':
            df.loc[index, 'data'] = df.loc[index, 'data'].lower()
        df.loc[index, 'case'] = new_case
        record(line_no, 'case_corrected', 'Corrected to ' + str(new_case))

    def correct_left_indent(index, new_indent):
        """Re-pad row ``index`` so that its text starts after ``new_indent`` spaces."""
        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data'].strip()
        print("Correcting left indent to", new_indent)
        df.loc[index, 'data'] = data.rjust(len(data) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_indent + len(data) - 1
        record(line_no, 'left_indent_corrected',
               'Left indent Corrected to ' + str(new_indent))

    def correct_right_indent(index, new_lcp):
        """Re-pad row ``index`` so that its last character sits at column ``new_lcp``."""
        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data'].strip()
        print("Correcting right indent to", 83 - new_lcp - 1)
        new_indent = new_lcp - len(data) + 1
        df.loc[index, 'data'] = data.rjust(len(data) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_lcp
        record(line_no, 'right_indent_corrected',
               'Right indent Corrected to ' + str(83 - new_lcp - 1))

    def delete_line_after(index):
        """Drop row ``index + 1``; raises KeyError when there is no such row."""
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index + 1, 'line_no']
        df.drop(index + 1, inplace=True)
        print("line deleted after", line_no)
        print("line no deleted ", removed_line_no)
        record(line_no, 'blank_deleted_after', 'Yes')
        record(removed_line_no, 'line_removed', 'Yes')

    def delete_line_before(index):
        """Drop row ``index - 1``; raises KeyError when there is no such row."""
        line_no = df.loc[index, 'line_no']
        removed_line_no = df.loc[index - 1, 'line_no']
        df.drop(index - 1, inplace=True)
        print("line deleted before", line_no)
        record(line_no, 'blank_deleted_before', 'Yes')
        record(removed_line_no, 'line_removed', 'Yes')

    def add_blank_row(label, line_no, case_value):
        """Append a blank row under the fractional index ``label``."""
        df.loc[label] = np.nan
        df.loc[label, 'data'] = ''
        df.loc[label, 'Identification_Status'] = 'blank'
        df.loc[label, 'case'] = case_value
        df.loc[label, 'plb'] = 'N'
        df.loc[label, 'nlb'] = 'N'
        df.loc[label, 'line_no'] = line_no

    def insert_line_after(index):
        """Insert a blank row between ``index`` and ``index + 1``."""
        line_no = df.loc[index, 'line_no']
        next_line_no = df.loc[index + 1, 'line_no']
        new_line_no = free_line_no((line_no + next_line_no) / 2, next_line_no)
        print("inserted blank line after ", line_no)
        # A fractional label sorts between the two neighbours once the
        # frame is re-sorted by index at the end of the audit.
        add_blank_row(index + 0.25, new_line_no, '')
        df.loc[index + 1, 'plb'] = 'Y'
        record(line_no, 'blank_inserted_after', 'Yes')
        register_blank(new_line_no, (('data', ''),
                                     ('data_corrected', ''),
                                     ('line_removed', 'No')))
        print("line inserted after ", line_no)

    def insert_line_before(index):
        """Insert a blank row between ``index - 1`` and ``index``."""
        line_no = df.loc[index, 'line_no']
        pvs_line_no = df.loc[index - 1, 'line_no']
        new_line_no = free_line_no((line_no + pvs_line_no) / 2, line_no)
        print("inserted blank line before", line_no)
        add_blank_row(index - 0.25, new_line_no, 'None')
        df.loc[index - 1, 'nlb'] = 'Y'
        record(line_no, 'blank_inserted_before', 'Yes')
        register_blank(new_line_no, (('line_removed', 'No'),))

    def check_and_remove_numbers(index):
        """Strip the digits a line starts with (a leading scene number)."""
        data = df.loc[index, 'data']
        scene_num = ''
        while True:
            leading = re.match(r'\d', data.lstrip())
            if leading is None:
                break
            digit = leading.group(0)
            # count=1: only the leading digit goes; the same digit further
            # along the line must survive.
            data = data.replace(digit, '', 1)
            df.loc[index, 'data'] = data
            print(data)
            scene_num += digit
        print("scene num", scene_num)

    # ------------------------------------------------------------------
    # rule fragments shared by the auditors
    # ------------------------------------------------------------------
    def fix_left_indent(index, new_indent):
        if df.loc[index, 'ssc'] != new_indent:
            correct_left_indent(index, new_indent)
        else:
            print("indent already", new_indent)

    def fix_case(index, new_case):
        if df.loc[index, 'case'] != new_case:
            correct_case(index, new_case)
        else:
            print("Case already", new_case)

    def require_blank_before(index):
        if df.loc[index, 'plb'] == 'N':
            insert_line_before(index)
            df.loc[index, 'plb'] = 'Y'
        else:
            print("previous line already blank")

    def forbid_blank_before(index, otherwise):
        if df.loc[index, 'plb'] == 'Y':
            delete_line_before(index)
            df.loc[index, 'plb'] = 'N'
        else:
            print(otherwise)

    def require_blank_after(index):
        if df.loc[index, 'nlb'] == 'N':
            insert_line_after(index)
            df.loc[index, 'nlb'] = 'Y'
        else:
            print("next line already blank")

    def forbid_blank_after(index):
        """Return True when the following row was dropped."""
        if df.loc[index, 'nlb'] == 'Y':
            delete_line_after(index)
            df.loc[index, 'nlb'] = 'N'
            return True
        print("next line not blank")
        return False

    # ------------------------------------------------------------------
    # one auditor per Identification_Status
    # ------------------------------------------------------------------
    def audit_ps1(index):
        print("Auditing Slugline")
        try:
            print(df.loc[index, 'data'])
        except UnicodeError:
            pass  # console cannot encode the text; the audit itself goes on
        fix_left_indent(index, 15)
        fix_case(index, 'AllUpper')
        if df.loc[index, 'plb'] == 'N':
            # NOTE(review): the source lost its indentation; the flag update
            # is assumed to belong to the ``index != 0`` guard.
            if index != 0:
                insert_line_before(index)
                df.loc[index, 'plb'] = 'Y'
        else:
            print("previous line already blank")
        require_blank_after(index)
        check_and_remove_numbers(index)

    def audit_action_start(index):
        print("Auditing Action Beginning")
        fix_left_indent(index, 15)
        require_blank_before(index)
        return forbid_blank_after(index)

    def audit_action_middle(index):
        print("Auditing Action Middle")
        fix_left_indent(index, 15)
        forbid_blank_before(index, "previous line already non blank")
        return forbid_blank_after(index)

    def audit_action_end(index):
        print("Auditing Action Ending")
        fix_left_indent(index, 15)
        if df.loc[index - 1, 'Identification_Status'] in ('ps4', 'ps5'):
            forbid_blank_before(index, "previous line already non blank")
        else:
            pnbl_line_no = df.loc[index, 'pnbl_line_no']
            try:
                flags = df.loc[df['line_no'] == pnbl_line_no, 'isIdentified']
                # .any(): a Series has no truth value of its own.
                pnbl_identified = bool((flags == 'Yes').any())
            except KeyError:
                pnbl_identified = False
            if df.loc[index, 'plb'] == 'N' and pnbl_identified:
                insert_line_before(index)
                df.loc[index, 'plb'] = 'Y'
            else:
                print("previous line already blank")
        require_blank_after(index)

    def audit_speaker(index):
        print("Auditing Speaker")
        fix_left_indent(index, 35)
        fix_case(index, 'AllUpper')
        require_blank_before(index)
        return forbid_blank_after(index)

    def make_parenthetical_auditor(title):
        def audit_parenthetical(index):
            print(title)
            fix_left_indent(index, 30)
            fix_case(index, 'AllLower')
            forbid_blank_before(index, "previous line already blank")
            return forbid_blank_after(index)
        return audit_parenthetical

    def make_dialogue_auditor(title):
        def audit_dialogue(index):
            print(title)
            fix_left_indent(index, 25)
            forbid_blank_before(index, "previous line already blank")
            return forbid_blank_after(index)
        return audit_dialogue

    def audit_dialogue_end(index):
        print("Auditing Dialogue End")
        fix_left_indent(index, 25)
        forbid_blank_before(index, "previous line already not blank")
        if df.loc[index, 'nlb'] == 'N':
            if df.loc[index + 1, 'Identification_Status'] == 'ps10':
                print("not inserting blank as next is parenthtical")
            else:
                insert_line_after(index)
                df.loc[index, 'nlb'] = 'Y'
        else:
            print("next line already blank")

    def audit_transition(index):
        print("Auditing Transition")
        cur_case = df.loc[index, 'case']
        new_lcp = 72
        if df.loc[index, 'lcp'] != new_lcp:
            correct_right_indent(index, new_lcp)
        else:
            print("indent already", new_lcp)
        print(cur_case, "123")
        fix_case(index, 'AllUpper')
        require_blank_before(index)
        require_blank_after(index)

    def audit_special_term(index):
        print("Auditing Special Term")
        fix_left_indent(index, 15)
        fix_case(index, 'AllUpper')
        require_blank_before(index)
        require_blank_after(index)

    auditors = {
        'ps1': audit_ps1,
        'ps4': audit_action_start,
        'ps5': audit_action_middle,
        'ps6': audit_action_end,
        'ps7': audit_speaker,
        'ps8': audit_speaker,
        'ps10': make_parenthetical_auditor("Auditing Parenthetical complete"),
        'ps11': make_parenthetical_auditor("Auditing Parenthetical beginning"),
        'ps20': make_parenthetical_auditor("Auditing Parenthetical middle"),
        'ps12': make_parenthetical_auditor("Auditing Parenthetical end"),
        'ps13': make_dialogue_auditor("Auditing Dialogue Beginning"),
        'ps14': make_dialogue_auditor("Auditing Dialogue Middle"),
        'ps15': audit_dialogue_end,
        'ps16': audit_transition,
        'ps17': audit_special_term,
    }

    # ------------------------------------------------------------------
    # driver
    # ------------------------------------------------------------------
    # The iterator is taken from the index as it is now, so rows inserted
    # during the audit are never visited.
    index_iter = iter(df.index)
    for index in index_iter:
        cur_line_pos = df.loc[index, 'Identification_Status']
        if cur_line_pos == 'blank':
            continue

        line_no = df.loc[index, 'line_no']
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(cur_line_pos)
        print(df['line_no'].dtype)

        auditor = auditors.get(cur_line_pos)
        if auditor is None:
            continue
        print(auditor)

        try:
            nl_deleted = auditor(index)
        except Exception as exc:
            # Best effort: one line that cannot be audited (typically a
            # missing neighbour row) must not stop the rest of the script.
            print("audit failed for line", line_no, ":", exc)
            nl_deleted = False

        if nl_deleted:
            # The row that followed was dropped; skip it in the iteration.
            next(index_iter, None)

    result = df.sort_index().reset_index(drop=True)
    #df = df.sort_values(by=['line_no']).reset_index(drop =True)

    if has_audit:
        return result, audit_df
    return result
|
||
|
||
|
||
|
||
|
||
def merge_line_to_para(df):
    """Collapse line-level rows into paragraph-level rows.

    Rows are processed in their positional order.  Consecutive lines that
    form one multi-line element (an action, parenthetical or dialogue run,
    or a ``ps2`` slugline followed by ``ps3``) are joined with single
    spaces into one paragraph; every other line becomes a paragraph of its
    own.  A slugline (``ps1``/``ps2``/``ps3``) starts a new scene.

    Args:
        df: Frame with the columns ``Identification_Status`` and ``data``.

    Returns:
        A frame with the columns ``para_no``, ``scene_no``, ``content`` and
        ``script_element``, indexed by ``para_no`` (starting at 1).  Lines
        whose status is not recognised yield a paragraph whose other three
        fields are empty strings.
    """
    columns = ['para_no', 'scene_no', 'content', 'script_element']

    # status -> element name, for lines that always stand alone
    standalone = {
        'ps6': 'action',
        'ps7': 'speaker',
        'ps8': 'speaker',
        'ps9': 'parenthetical',
        'ps10': 'parenthetical',
        'ps12': 'parenthetical',
        'ps15': 'dialogue',
        'ps16': 'transition',
        'ps17': 'special_term',
        'ps27': 'dialogue',
    }
    # status -> (element name, statuses of the lines it absorbs)
    absorbing = {
        'ps4': ('action', ('ps5', 'ps6')),
        'ps5': ('action', ('ps6',)),
        'ps11': ('parenthetical', ('ps20', 'ps12')),
        'ps20': ('parenthetical', ('ps12',)),
        'ps13': ('dialogue', ('ps14', 'ps15')),
        'ps14': ('dialogue', ('ps15',)),
    }
    sluglines = ('ps1', 'ps2', 'ps3')

    def clean(value):
        """Return the stripped text of a cell; missing cells become ''."""
        if isinstance(value, str):
            return value.strip()
        if value is None or pd.isna(value):
            return ''
        return str(value).strip()

    statuses = df['Identification_Status'].tolist()
    texts = [clean(value) for value in df['data'].tolist()]
    total = len(statuses)

    rows = []
    scene_no = 0
    pos = 0
    while pos < total:
        status = statuses[pos]
        content = texts[pos]
        pos += 1
        element = ''
        scene = ''

        if status == 'blank':
            content, element, scene = '', 'blank', scene_no
        elif status in sluglines:
            # A two-line slugline: ps2 swallows the ps3 that follows it.
            if status == 'ps2' and pos < total and statuses[pos] == 'ps3':
                content += ' ' + texts[pos]
                pos += 1
            scene_no += 1
            element, scene = 'slugline', scene_no
        elif status in absorbing:
            element, followers = absorbing[status]
            # ``pos < total`` keeps an element on the last row from
            # looking past the end of the frame.
            while pos < total and statuses[pos] in followers:
                content += ' ' + texts[pos]
                pos += 1
            scene = scene_no
        elif status in standalone:
            element, scene = standalone[status], scene_no
        else:
            content = ''

        rows.append((len(rows) + 1, scene, content, element))

    para_nos = [row[0] for row in rows]
    return pd.DataFrame(rows, columns=columns, index=para_nos, dtype=object)
|
||
|
||
|
||
|
||
|
||
|
||
def wrap_text(df,audit_df):
|
||
# df.reset_index(inplace=True, drop=True)
|
||
# audit_df.reset_index(inplace=True, drop=True) #df['line_no'] = df['line_no'].astype(str).astype(float)
|
||
index_iter = iter(df.index)
|
||
print("wrapping lines")
|
||
print(df.dtypes)
|
||
for index in index_iter:
|
||
line_pos = df['Identification_Status'][index]
|
||
if line_pos == 'blank' or df['isIdentified'][index] == 'No':
|
||
continue
|
||
|
||
data = df['data'][index]
|
||
data = data.strip()
|
||
|
||
print("line no",df['line_no'][index],df['line_no'].dtype)
|
||
|
||
if line_pos == 'ps1':
|
||
print("checking Slugline")
|
||
if len(data) > 58:
|
||
print("Need to wrap line")
|
||
print("data 9808",data)
|
||
wrapped_data = textwrap.wrap(data, width = 58)
|
||
for line in wrapped_data:
|
||
try:
|
||
print("line 9812:",line)
|
||
except:
|
||
pass
|
||
|
||
|
||
if line_pos == 'ps6':
|
||
action_data = ''
|
||
action_list = []
|
||
print("checking Action line")
|
||
cur_lines_count = 0
|
||
action_index = index
|
||
while line_pos != 'blank':
|
||
data = df['data'][action_index]
|
||
line_no = df['line_no'][action_index]
|
||
try:
|
||
print("9827\n",line_pos,line_no,data)
|
||
except:
|
||
pass
|
||
action_data = data.strip() + ' ' + action_data
|
||
cur_lines_count += 1
|
||
action_list.append(line_no)
|
||
action_index -= 1
|
||
if action_index < 0:
|
||
break
|
||
try:
|
||
line_pos = df['Identification_Status'][action_index]
|
||
except:
|
||
line_pos = ''
|
||
|
||
if line_pos == '' or df['isIdentified'][action_index] == 'No':
|
||
break
|
||
|
||
if action_index < 0:
|
||
continue
|
||
|
||
if line_pos == '' or df['isIdentified'][action_index] == 'No':
|
||
continue
|
||
action_start_index = action_index + 1
|
||
action_data = action_data.strip()
|
||
print("Number of action lines",cur_lines_count)
|
||
if len(action_data) > 58:
|
||
print("Need to wrap line")
|
||
try:
|
||
print("actiob data:\n",action_data)
|
||
except:
|
||
pass
|
||
wrapped_data = textwrap.wrap(action_data, width = 58)
|
||
print("Wrapped line 9753")
|
||
wrapped_data_lines_count = len(wrapped_data)
|
||
if cur_lines_count == wrapped_data_lines_count:
|
||
#can change the original line(s) data
|
||
print("cur and wrapped number of lines same")
|
||
# for i in range(0,cur_lines_count):
|
||
# print(wrapped_data[i])
|
||
elif wrapped_data_lines_count > cur_lines_count:
|
||
lines_to_add = wrapped_data_lines_count - cur_lines_count
|
||
#Multiple action lines
|
||
print("will need to create ",lines_to_add," more lines")
|
||
pvs_line_no = df['line_no'][index-1] #float
|
||
cur_line_no = df['line_no'][index] #float
|
||
cur_line_index = index
|
||
pvs_line_index = cur_line_index -1
|
||
print("in line no 9874")
|
||
while lines_to_add != 0:
|
||
new_line_no = 0.0
|
||
|
||
pvs_line_no = df['line_no'][pvs_line_index] #float
|
||
new_line_no = (cur_line_no + pvs_line_no ) / 2
|
||
while new_line_no in audit_df.index:
|
||
new_line_no = (cur_line_no + new_line_no)/2
|
||
action_list.append(new_line_no)
|
||
new_line_index = (cur_line_index + pvs_line_index) /2
|
||
df.loc[new_line_index] = np.nan
|
||
df.loc[new_line_index,'line_no'] = new_line_no
|
||
if df['Identification_Status'][pvs_line_index] == 'blank':
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps4'
|
||
else:
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps5'
|
||
|
||
#cur_line_no = new_line_no
|
||
pvs_line_index = new_line_index
|
||
#cur_line_index -= 1
|
||
lines_to_add -= 1
|
||
|
||
|
||
elif wrapped_data_lines_count < cur_lines_count:
|
||
lines_to_remove = cur_lines_count - wrapped_data_lines_count
|
||
print("Will need to remove ",lines_to_remove, "lines")
|
||
remove_index = index -1
|
||
#pvs_line_no = df['line_no'][remove_index]
|
||
while lines_to_remove != 0:
|
||
pvs_line_no = df['line_no'][remove_index]
|
||
## remove pvs line
|
||
df.drop(remove_index, inplace= True)
|
||
audit_df['line_removed'][pvs_line_no] = 'Yes'
|
||
action_list.remove(pvs_line_no)
|
||
remove_index -= 1
|
||
lines_to_remove -= 1
|
||
|
||
action_list.sort()
|
||
print(action_list)
|
||
## add these lines in the original df
|
||
print("in line no 9914")
|
||
## now assign the values to these lines
|
||
wrapped_index = 0
|
||
for line_no in action_list:
|
||
df_index = df.index[df['line_no'] == line_no]
|
||
df['data'][df_index] = wrapped_data[wrapped_index]
|
||
if line_no not in audit_df.index:
|
||
audit_df.loc[line_no] = np.nan
|
||
audit_df.loc[line_no,'data'] = ''
|
||
audit_df.loc[line_no,'data_corrected'] = ''
|
||
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
|
||
wrapped_index += 1
|
||
|
||
|
||
|
||
else:
|
||
print("No need to wrap line")
|
||
try:
|
||
print(action_data)
|
||
except:
|
||
pass
|
||
print(len(action_data))
|
||
print("in line no 9936")
|
||
|
||
if line_pos == 'ps15':
|
||
dialogue_data = ''
|
||
dialogue_list = []
|
||
print("\n checking Dialogue line",index)
|
||
cur_lines_count = 0
|
||
dialogue_index = int(index)
|
||
while line_pos not in ('ps7','ps8','ps10','ps12','ps5','ps6'): ## added 5 and 6 as wrong identification causes previous line to be ps5
|
||
data = df['data'][dialogue_index]
|
||
line_no = df['line_no'][dialogue_index]
|
||
try:
|
||
print(dialogue_index,line_no,line_pos,data)
|
||
except:
|
||
pass
|
||
|
||
#dialogue_data = data.strip() + ' ' + dialogue_data
|
||
try:
|
||
dialogue_data = data.strip() + ' ' + dialogue_data
|
||
except:
|
||
data = str(data)
|
||
dialogue_data = data.strip() + ' ' + dialogue_data
|
||
|
||
cur_lines_count += 1
|
||
if dialogue_index == index:
|
||
df['Identification_Status'][dialogue_index] = 'ps15'
|
||
else:
|
||
df['Identification_Status'][dialogue_index] = 'ps14'
|
||
dialogue_index -= 1
|
||
dialogue_list.append(line_no)
|
||
print("\nprinting isIdentified: ")
|
||
try:
|
||
li = df['isIdentified'][dialogue_index] == 'No'
|
||
print("dialogue bunch not fully identified")
|
||
except:
|
||
li = ''
|
||
print("dialogue bunch not fully identified")
|
||
if li == '' or df['isIdentified'][dialogue_index] == 'No':
|
||
break
|
||
|
||
# if df['isIdentified'][dialogue_index] == 'No' :
|
||
# print("dialogue bunch not fully identified")
|
||
# break
|
||
line_pos = df['Identification_Status'][dialogue_index]
|
||
|
||
if li == '' or df['isIdentified'][dialogue_index] == 'No' : #added li == ''
|
||
print("dialogue bunch not fully identified")
|
||
continue
|
||
|
||
|
||
dialogue_start_index = dialogue_index + 1
|
||
if dialogue_start_index != index:
|
||
df['Identification_Status'][dialogue_start_index] = 'ps13'
|
||
dialogue_data = dialogue_data.strip()
|
||
print("Number of dialogue lines 9990",cur_lines_count)
|
||
if len(dialogue_data) > 35:
|
||
print("Need to wrap dialogue line 9992")
|
||
try:
|
||
print(dialogue_data)
|
||
except:
|
||
pass
|
||
wrapped_data = textwrap.wrap(dialogue_data, width = 35)
|
||
|
||
wrapped_data_lines_count = len(wrapped_data)
|
||
if cur_lines_count == wrapped_data_lines_count:
|
||
#can change the original line(s) data
|
||
print("cur and wrapped number of lines same")
|
||
# for i in range(0,cur_lines_count):
|
||
# print(wrapped_data[i])
|
||
elif wrapped_data_lines_count > cur_lines_count:
|
||
lines_to_add = wrapped_data_lines_count - cur_lines_count
|
||
#Multiple action lines
|
||
print("will need to create ",lines_to_add," more lines")
|
||
pvs_line_no = float(df['line_no'][index-1])
|
||
cur_line_no = float(df['line_no'][index])
|
||
cur_line_index = index
|
||
pvs_line_index = cur_line_index -1
|
||
|
||
while lines_to_add != 0:
|
||
new_line_no = 0.0
|
||
|
||
pvs_line_no = float(df['line_no'][pvs_line_index])
|
||
new_line_no = (cur_line_no + pvs_line_no ) / 2
|
||
|
||
while (new_line_no in audit_df.index) or (new_line_no in dialogue_list):
|
||
new_line_no = (cur_line_no + new_line_no)/2
|
||
new_line_no = new_line_no
|
||
print(cur_line_index,cur_line_no,pvs_line_no,new_line_no)
|
||
dialogue_list.append(new_line_no)
|
||
new_line_index = (cur_line_index + pvs_line_index) /2
|
||
df.loc[new_line_index] = np.nan
|
||
df.loc[new_line_index,'line_no'] = new_line_no
|
||
if df['Identification_Status'][pvs_line_index] in ('ps7','ps10','ps12'):
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps13'
|
||
else:
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps14'
|
||
|
||
#cur_line_no = new_line_no
|
||
#cur_line_index -= 1
|
||
pvs_line_index = new_line_index
|
||
lines_to_add -= 1
|
||
|
||
|
||
elif wrapped_data_lines_count < cur_lines_count:
|
||
lines_to_remove = cur_lines_count - wrapped_data_lines_count
|
||
print("Will need to remove ",lines_to_remove, "lines")
|
||
remove_index = index -1
|
||
#pvs_line_no = df['line_no'][remove_index]
|
||
while lines_to_remove != 0:
|
||
pvs_line_no = df['line_no'][remove_index]
|
||
## remove pvs line
|
||
df.drop(remove_index, inplace= True)
|
||
audit_df['line_removed'][pvs_line_no] = 'Yes'
|
||
dialogue_list.remove(pvs_line_no)
|
||
remove_index -= 1
|
||
lines_to_remove -= 1
|
||
try:
|
||
dialogue_list.sort()
|
||
except:
|
||
print("converting dialogue_list to float")
|
||
dialogue_list = [float(value) if type(value) != int else value for value in dialogue_list]
|
||
dialogue_list.sort()
|
||
print("diaogue_list",dialogue_list)
|
||
## add these lines in the original df
|
||
print("in line no 10060")
|
||
## now assign the values to these lines
|
||
wrapped_index = 0
|
||
for line_no in dialogue_list:
|
||
df_index = df.index[df['line_no'] == line_no]
|
||
df['data'][df_index] = wrapped_data[wrapped_index]
|
||
if line_no not in audit_df.index:
|
||
audit_df.loc[line_no] = 'No'
|
||
audit_df.loc[line_no,'data'] = ''
|
||
audit_df.loc[line_no,'data_corrected'] = ''
|
||
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
|
||
wrapped_index += 1
|
||
|
||
|
||
|
||
else:
|
||
print("No need to wrap line")
|
||
try:
|
||
print(dialogue_data)
|
||
except:
|
||
pass
|
||
print(len(dialogue_data))
|
||
|
||
|
||
# if all(isinstance(val, int) for val in df['line_no']):
|
||
# print("All values in 'line_no' are integers.")
|
||
# elif all(isinstance(val, str) for val in df['line_no']):
|
||
# print("All values in 'line_no' are strings. Converting to floats or integers...")
|
||
|
||
#try:
|
||
# df['line_no'] = df['line_no'].astype(int)
|
||
# print("Converted 'line_no' column to integers.")
|
||
#except ValueError:
|
||
# df['line_no'] = df['line_no'].astype(float)
|
||
# print("Converted 'line_no' column to floats.")
|
||
# else:
|
||
# print("Values in 'line_no' are of mixed types.")
|
||
|
||
df = df.sort_values(by=['line_no']).reset_index(drop =True)
|
||
|
||
|
||
index_iter = iter(df.index)
|
||
df.fillna({'data':''},inplace=True)
|
||
for index in index_iter:
|
||
print(index)
|
||
line_pos = df['Identification_Status'][index]
|
||
if line_pos == 'blank':
|
||
continue
|
||
|
||
data = df['data'][index]
|
||
try:
|
||
print("data",data)
|
||
print(type(data))
|
||
except:
|
||
pass
|
||
|
||
data = data.strip()
|
||
|
||
if line_pos == 'ps10' :
|
||
par_data = ''
|
||
par_list = []
|
||
print("checking Parenthetical line")
|
||
cur_lines_count = 0
|
||
par_index = index
|
||
|
||
data = df['data'][par_index]
|
||
line_no = df['line_no'][par_index]
|
||
try:
|
||
print(line_pos,data)
|
||
except:
|
||
pass
|
||
par_data = data.strip()
|
||
cur_lines_count += 1
|
||
|
||
par_list.append(line_no)
|
||
line_pos = df['Identification_Status'][par_index]
|
||
|
||
print("Number of parenthetical lines",cur_lines_count)
|
||
print("index",par_index,"line_no",line_no)
|
||
|
||
if len(par_data) > 20:
|
||
print("Need to wrap parenthetical line 10133")
|
||
try:
|
||
print(par_data)
|
||
except:
|
||
pass
|
||
wrapped_data = textwrap.wrap(par_data, width = 20)
|
||
wrapped_data_lines_count = len(wrapped_data)
|
||
|
||
if wrapped_data_lines_count > cur_lines_count:
|
||
lines_to_add = wrapped_data_lines_count - cur_lines_count
|
||
#Multiple par lines
|
||
print("will need to create ",lines_to_add," more lines")
|
||
pvs_line_no = df['line_no'][index-1] #float
|
||
cur_line_no = df['line_no'][index] #float
|
||
cur_line_index = index
|
||
pvs_line_index = cur_line_index -1
|
||
|
||
while lines_to_add != 0:
|
||
|
||
new_line_no = 0.0
|
||
pvs_line_no = df['line_no'][pvs_line_index]
|
||
try:
|
||
new_line_no = (cur_line_no + pvs_line_no ) / 2
|
||
except:
|
||
new_line_no = (float(cur_line_no) + pvs_line_no ) / 2
|
||
|
||
while new_line_no in audit_df.index:
|
||
new_line_no = (cur_line_no + new_line_no)/2
|
||
new_line_no = (new_line_no)
|
||
par_list.append(new_line_no)
|
||
new_line_index = (cur_line_index + pvs_line_index) /2
|
||
df.loc[new_line_index] = np.nan
|
||
df.loc[new_line_index,'line_no'] = new_line_no
|
||
if df['Identification_Status'][pvs_line_index] in ('ps7','ps8','ps15'):
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps11'
|
||
df.loc[new_line_index,'isIdentified'] = 'Yes'
|
||
else:
|
||
df.loc[new_line_index,'Identification_Status'] = 'ps20'
|
||
df.loc[new_line_index,'isIdentified'] = 'Yes'
|
||
|
||
cur_line_no = new_line_no
|
||
cur_line_index = new_line_index
|
||
lines_to_add -= 1
|
||
|
||
|
||
df['Identification_Status'][index] = 'ps12'
|
||
try:
|
||
par_list.sort()
|
||
except :
|
||
print("exception accepted:")
|
||
par_list = [np.array([float(x)]) if isinstance(x, str) else x for x in par_list]
|
||
par_list.sort()
|
||
|
||
print("\n\npar_list:",par_list,"\n\n")
|
||
## add these lines in the original df
|
||
|
||
## now assign the values to these lines
|
||
wrapped_index = 0
|
||
for line_no in par_list:
|
||
try:
|
||
df_index = df.index[df['line_no'] == line_no]
|
||
print("try block executed\n")
|
||
except:
|
||
print("Exception:")
|
||
df_index = df.index[df['line_no'] == line_no[0]]
|
||
print("except block executed\n")
|
||
|
||
print("printing df_index 10200",df_index,"\n")
|
||
df['data'][df_index] = wrapped_data[wrapped_index]
|
||
print("printing audit_df:\n",audit_df.index,"\n")
|
||
print("checking the audit_fd:",line_no,"\n")
|
||
|
||
|
||
try:
|
||
if line_no not in audit_df.index:
|
||
audit_df.loc[line_no] = np.nan
|
||
audit_df.loc[line_no]['data'] = ''
|
||
audit_df.loc[line_no]['data_corrected'] = ''
|
||
print("###########try############")
|
||
except Exception as e:
|
||
print("Exception accepted:",e)
|
||
|
||
audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes'
|
||
wrapped_index += 1
|
||
|
||
|
||
|
||
else:
|
||
print("No need to wrap line")
|
||
try:
|
||
print(par_data)
|
||
except:
|
||
pass
|
||
print(len(par_data))
|
||
|
||
try:
|
||
df = df.sort_values(by=['line_no']).reset_index(drop =True)
|
||
except:
|
||
print("Exception 10184:")
|
||
df['line_no'] = [np.float64(val) if isinstance(val, str) else val for val in df['line_no']]
|
||
df = df.sort_values(by=['line_no']).reset_index(drop =True)
|
||
print("The df in merge_text123456789")
|
||
print(df)
|
||
return df
|
||
|
||
|
||
def check_slug_still_unidentified(df):
    """Tell whether an unidentified row still lists a slug code as a top candidate.

    Only rows whose 'isIdentified' value is 'No' are inspected.  For each of
    them 'Identification_Status' is split on ';' and the first two entries are
    compared against 'ps1', 'ps2' and 'ps18'.

    Args:
        df: DataFrame with 'isIdentified' and 'Identification_Status' columns.

    Returns:
        True as soon as one such row matches, otherwise False.
    """
    # presumably the slug-related position codes; ps18 is not mapped by
    # ps_to_script_element -- confirm its meaning with the identification step
    slug_codes = ['ps1', 'ps2', 'ps18']

    print("checking if all slugs were identified")
    pending_rows = df.loc[df['isIdentified'] == 'No', :]

    for row_label in pending_rows.index:
        try:
            candidates = pending_rows['Identification_Status'][row_label].split(';')
            print(candidates)
            candidates = candidates[0:2]
            print("top2 line pos", candidates)
        except:
            # Any failure here (e.g. a non-string status) leaves the row
            # without candidates, so it cannot match below.
            candidates = []

        if any(candidate in slug_codes for candidate in candidates):
            return True

    return False
|
||
|
||
|
||
def sa_wrapped_output_to_docx(para_df,output_docx):
    """Write classified script paragraphs to a new .docx with per-element indents.

    A fresh document is created in Courier New 12pt on a page 297mm high and
    8.57in wide with a 1.5in left margin.  One paragraph is added per row of
    ``para_df`` (zero spacing before/after, 12pt line spacing).  Rows whose
    'script_element' is 'blank' are left as empty paragraphs; every other row
    receives its 'content' as text, with indents chosen by element type.

    Args:
        para_df: DataFrame with 'script_element' and 'content' columns.
        output_docx: Destination handed to ``Document.save``.
    """
    # (left indent, right indent) in inches for each known script element.
    # Elements that are not listed keep the paragraph's default indents.
    indents_in_inches = {
        'slugline': (0, 0),
        'action': (0, 0),
        'dialogue': (1.0, 1.25),
        'parenthetical': (1.5, 2.25),
        'speaker': (2, 1),
        'transition': (2.5, 0),
        'special_term': (0, 0),
    }

    document = Document()

    normal_font = document.styles['Normal'].font
    normal_font.name = 'Courier New'
    normal_font.size = Pt(12)

    first_section = document.sections[0]
    first_section.page_height = Mm(297)
    first_section.page_width = Inches(8.57)
    first_section.left_margin = Inches(1.5)

    for row_label in para_df.index:
        paragraph = document.add_paragraph()

        fmt = paragraph.paragraph_format
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        fmt.line_spacing = Pt(12)

        element = para_df['script_element'][row_label]
        text = para_df['content'][row_label]

        # The paragraph already exists at this point, so a blank row still
        # contributes an empty line to the document.
        if element == 'blank':
            continue

        if element == 'transition':
            paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        if element in indents_in_inches:
            left_inches, right_inches = indents_in_inches[element]
            fmt.left_indent = Inches(left_inches)
            fmt.right_indent = Inches(right_inches)

        paragraph.text = text

    document.save(output_docx)
|
||
|
||
|
||
|
||
|
||
|
||
def sa_output_to_docx(df,output_docx,output_template):
    """Write the audited script lines in ``df`` to a .docx built from a template.

    The template document is set to Courier New 12pt on a 210mm x 297mm page
    with a 1.5in left margin.  Every row of ``df`` becomes one paragraph.
    Rows with status 'ps1'/'ps2' are prefixed and suffixed with a running
    scene number, unless ``check_slug_still_unidentified(df)`` reports
    unresolved slug candidates, in which case all rows are written plainly.
    "FADE IN:" is added to the template's first paragraph when the script
    opens on a 'ps1'/'ps2' row (optionally after one 'blank' row), and a
    "FADE OUT:" paragraph closes the document.

    A running line counter drives page breaks: near the bottom of a page
    (56 or 57 lines) a break is inserted before a speaker or slug line so it
    is not stranded; the counter restarts at 58.

    Args:
        df: DataFrame with 'Identification_Status' and 'data' columns.  The
            look-ahead uses ``index + 1``, so consecutive integer labels are
            assumed -- TODO confirm callers always reset the index.
        output_docx: Destination handed to ``Document.save``.
        output_template: Template .docx; it must already contain at least one
            paragraph, because its first paragraph is reused.
    """
    slug_codes = ('ps1', 'ps2')
    speaker_codes = ('ps7', 'ps8')

    output_doc = Document(output_template)

    font = output_doc.styles['Normal'].font
    font.name = 'Courier New'
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Inches(1.5)

    def _add_script_paragraph():
        # New paragraph with the tight spacing used for every script line.
        new_para = output_doc.add_paragraph()
        new_format = new_para.paragraph_format
        new_format.space_before = Pt(0)
        new_format.space_after = Pt(0)
        new_format.line_spacing = Pt(12)
        return new_para

    paragraph = output_doc.paragraphs[0]
    first_format = paragraph.paragraph_format
    first_format.space_before = Pt(0)
    first_format.space_after = Pt(0)
    first_format.line_spacing = Pt(12)

    slug_still_unidentified = check_slug_still_unidentified(df)
    print(slug_still_unidentified)

    status = df['Identification_Status']

    # Series.get returns None for a missing label, so empty or one-row frames
    # and the look-ahead past the last row no longer raise KeyError.
    lines_added = 0
    if status.get(0) == 'blank' and status.get(1) in slug_codes:
        paragraph.add_run("FADE IN:")
        lines_added = 1
    elif status.get(0) in slug_codes:
        # Script starts directly on a slug: add FADE IN plus a blank line.
        paragraph.add_run("FADE IN:")
        paragraph.add_run().add_break()
        lines_added = 2

    scene_no = 1

    for index in df.index:
        pos = status[index]

        if lines_added == 56:
            if pos in speaker_codes:
                # Break only when the speaker would be separated from what
                # follows it on the next line.
                if status.get(index + 1) in ('ps9', 'ps10'):
                    output_doc.add_page_break()
                    lines_added = 0
            elif pos in slug_codes:
                output_doc.add_page_break()
                lines_added = 0

        if lines_added == 57:
            # On the last line of a page neither a speaker nor a slug may start.
            if pos in speaker_codes or pos in slug_codes:
                output_doc.add_page_break()
                lines_added = 0

        if lines_added == 58:
            lines_added = 0

        data = df['data'][index]
        try:
            print(index, data)
        except Exception:
            # Debug output only; printing must never abort the export.
            pass

        if not slug_still_unidentified and pos in slug_codes:
            scene_data = str(scene_no)
            left_indent = 12

            print("Removing already present scene number")
            print("Adding scene number")
            # NOTE(review): the first two branches are identical as written;
            # presumably they were meant to pad one- and two-digit scene
            # numbers differently -- confirm the intended separators.
            if scene_no < 9:
                data = scene_data + ' ' + data.lstrip()
            elif scene_no < 100:
                data = scene_data + ' ' + data.lstrip()
            else:
                data = scene_data + data.lstrip()

            data = data.rjust(len(data) + left_indent)
            data = data.rstrip()

            # Right-align the repeated scene number against column 63.
            scene_indent = 63 - len(data.strip())
            data = data + scene_data.rjust(scene_indent)
            scene_no += 1

            para = _add_script_paragraph()
            para.paragraph_format.left_indent = -Inches(0.3)
            # Drop the 12 padding characters added above.
            para.text = data[12:]
        else:
            para = _add_script_paragraph()
            # Non-blank lines lose their first 15 characters
            # (presumably the text-layout left margin -- confirm).
            para.text = data[15:] if data.strip() else ''

        lines_added += 1

    fade_out = "FADE OUT:"
    closing_para = _add_script_paragraph()
    closing_para.text = fade_out.rjust(58 - len(fade_out))

    output_doc.save(output_docx)
|
||
|
||
|
||
|
||
# def sa_txt_to_docx(script_txt,output_script_docx):
|
||
|
||
# output_template_name = 'ScriptTemplate5.docx'
|
||
# output_template = os.path.join(mypath,output_template_name)
|
||
|
||
# new_doc = Document(output_template)
|
||
# style = new_doc.styles['Normal']
|
||
# font = style.font
|
||
# font.name = 'Courier New'
|
||
# font.size = Pt(12)
|
||
# section = new_doc.sections[0]
|
||
# section.page_height = Mm(297)
|
||
# section.page_width = Mm(210)
|
||
# #section.page_width = Inches(11)
|
||
# section.left_margin = Inches(1.5)
|
||
# header = section.header
|
||
|
||
# with open(script_txt,'r',encoding='utf-8') as txt_in:
|
||
# lines = txt_in.readlines()
|
||
# for line in lines:
|
||
# para = new_doc.add_paragraph()
|
||
# paragraph_format = para.paragraph_format
|
||
|
||
# paragraph_format.space_before = Pt(0)
|
||
# paragraph_format.space_after = Pt(0)
|
||
|
||
# paragraph_format.line_spacing = Pt(12)
|
||
|
||
# if line.strip():
|
||
# para.text = line[15:]
|
||
# else:
|
||
# para.text = ''
|
||
|
||
# new_doc.save(output_script_docx)
|
||
|
||
def sa_output_to_txt(output_script_docx,output_script_txt):
    """Export the paragraphs of a .docx as space-indented plain text.

    Each paragraph's text is split on newlines and every resulting line is
    written to ``output_script_txt`` (UTF-8), left-padded with
    ``int((first_line_indent + left_indent) * 10) + 15`` spaces, where the
    indents are measured in inches.  A document without paragraphs yields an
    empty file.

    Args:
        output_script_docx: Path of the .docx to read.
        output_script_txt: Path of the text file to (over)write.
    """
    from docx import Document

    def _inches(length):
        # python-docx reports an indent that is not set on the paragraph as
        # None; treat that as zero.
        return 0 if length is None else length.inches

    def _debug_print(text):
        # Debug output only; a console that cannot encode the text must not
        # abort the export.
        try:
            print(text)
        except Exception:
            pass

    read_doc = Document(output_script_docx)
    all_paras = read_doc.paragraphs
    print(len(all_paras))
    left_margin = 15

    with open(output_script_txt, 'w', encoding='utf-8') as f:
        for para in all_paras:
            paragraph_format = para.paragraph_format
            fli = _inches(paragraph_format.first_line_indent)
            li = _inches(paragraph_format.left_indent)
            ri = _inches(paragraph_format.right_indent)

            # Ten character columns per inch of indent.
            indent = int((fli + li) * 10)
            print(fli, li, indent, ri)

            lines = para.text.split('\n')
            print(len(lines))
            for line in lines:
                _debug_print(line)
                line = line.rjust(len(line) + indent + left_margin)
                _debug_print(line)

                f.write(line)
                f.write('\n')
|
||
|
||
def print_audit_report_docx(audit_df,audit_report_docx):
    """Write a line-by-line audit report to a .docx file.

    After a centred title, one paragraph is emitted per row of ``audit_df``:
    the line number (the row label), the current data and then either
    "Line was removed" or the corrected data followed by a numbered list of
    the changes recorded in the flag columns.  A flag counts as a change when
    it is neither 'No' nor missing (NaN/None); rows added to the audit frame
    with all-NaN values would otherwise raise TypeError during string
    concatenation and be reported with changes that never happened.

    Args:
        audit_df: DataFrame indexed by line number with the columns 'data',
            'data_corrected', 'line_removed', 'left_indent_corrected',
            'right_indent_corrected', 'case_corrected',
            'line_wrapped_at_prescribed_right_indent',
            'line_broken_into_multiple_lines' and 'line_merged_with_next_line'.
        audit_report_docx: Destination handed to ``Document.save``.
    """
    def _as_text(value):
        # Missing cells are rendered as empty text instead of raising on
        # ``str + float``.
        return '' if pd.isna(value) else str(value)

    def _is_change(value):
        return not pd.isna(value) and value != 'No'

    def _add_line(target_para, text):
        # One run holding ``text`` followed by a single line break.
        new_run = target_para.add_run()
        new_run.text = text
        new_run.add_break()

    # (flag column, function turning the flag value into the report text)
    change_columns = (
        ('left_indent_corrected', _as_text),
        ('right_indent_corrected', _as_text),
        ('case_corrected', lambda value: 'Case ' + _as_text(value)),
        ('line_wrapped_at_prescribed_right_indent',
         lambda value: 'Line Wrapped at Prescribed Right Indent'),
        ('line_broken_into_multiple_lines',
         lambda value: 'Line Broken into Multiple Lines'),
        ('line_merged_with_next_line',
         lambda value: 'Line Merged with Next Line'),
    )

    output_doc = Document()

    para = output_doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = para.add_run()
    run.text = ' Audit Report'
    run.add_break()
    run.add_break()

    for index in audit_df.index:
        para = output_doc.add_paragraph()

        run = para.add_run()
        # NOTE(review): this break is added before run.text is assigned;
        # python-docx documents the text setter as replacing existing run
        # content, so the break likely never reaches the output -- confirm.
        run.add_break()
        run.text = "Line No: " + str(index)
        run.add_break()
        run.add_break()

        _add_line(para, "Current Data: " + _as_text(audit_df['data'][index]))

        if audit_df['line_removed'][index] == 'Yes':
            _add_line(para, "Line was removed")
            continue

        _add_line(para, "Corrected Data: " + _as_text(audit_df['data_corrected'][index]))
        _add_line(para, "Changes Done:- ")

        sno = 1
        for column, describe in change_columns:
            value = audit_df[column][index]
            if not _is_change(value):
                continue
            _add_line(para, str(sno) + '. ' + describe(value))
            sno += 1

        if sno == 1:
            _add_line(para, 'No Changes Done')

    output_doc.save(audit_report_docx)
|
||
|
||
def ps_to_script_element(ps):
    """Translate a position code such as 'ps7' into its script-element label.

    Args:
        ps: Position code to look up.

    Returns:
        The label for the code ('Slugline', 'Action', 'Speaker',
        'Speaker with Extension', 'Speaker Extension', 'Parenthetical',
        'Dialogue', 'Transition' or 'Special Term'), or '' for any value
        that is not a known code.
    """
    labelled_codes = (
        (('ps1', 'ps2', 'ps3'), 'Slugline'),
        (('ps4', 'ps5', 'ps6'), 'Action'),
        (('ps7',), 'Speaker'),
        (('ps8',), 'Speaker with Extension'),
        (('ps9',), 'Speaker Extension'),
        (('ps10', 'ps11', 'ps12', 'ps20'), 'Parenthetical'),
        (('ps13', 'ps14', 'ps15'), 'Dialogue'),
        (('ps16',), 'Transition'),
        (('ps17',), 'Special Term'),
    )
    for codes, label in labelled_codes:
        if ps in codes:
            return label
    return ''
|
||
|
||
|
||
|
||
# def print_audit_report_tabular_docx(audit_df):
|
||
# print("inside audit report")
|
||
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
|
||
|
||
# output_doc = Document()
|
||
|
||
# style = output_doc.styles['Normal']
|
||
# font = style.font
|
||
# #font.name = 'Courier New'
|
||
# font.size = Pt(8)
|
||
|
||
# section = output_doc.sections[-1]
|
||
# section.orientation = WD_ORIENT.LANDSCAPE
|
||
|
||
# section.page_width = Inches(11)
|
||
# section.left_margin = Inches(0.25)
|
||
# section.right_margin = Inches(0.25)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(12)
|
||
# run.text = ' Audit Report'
|
||
# run.add_break()
|
||
# run.add_break()
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(10)
|
||
# run.text = ' Audit Summary'
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(9)
|
||
# print("audit summary column is created")
|
||
# case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:])
|
||
# left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:])
|
||
# right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:])
|
||
# wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:])
|
||
|
||
|
||
# table =output_doc.add_table(1, cols =2)
|
||
# table.style = 'Table Grid'
|
||
|
||
# font.size = Pt(9)
|
||
|
||
# heading_cells = table.rows[0].cells
|
||
# heading_cells[0].width = Inches(2)
|
||
# heading_cells[1].width = Inches(1)
|
||
# heading_cells[0].text = 'Type of Change Done'
|
||
# heading_cells[1].text = 'Count of Lines'
|
||
# for i in range(0,2):
|
||
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Case Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(case_corrected_count)
|
||
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Left Indent Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(left_indent_corrected_count)
|
||
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Case Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(right_indent_corrected_count)
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Lines Wrapped at prescribed indents'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(wrapped_lines_count)
|
||
|
||
|
||
|
||
# # run.add_break()
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# run.add_break()
|
||
# run.add_break()
|
||
|
||
# font.size = Pt(8)
|
||
|
||
# no_rows = len(audit_df.index)
|
||
# table =output_doc.add_table(1, cols =6)
|
||
# table.style = 'Table Grid'
|
||
|
||
# table.autofit = False
|
||
# # table.columns[0].width = Inches(0.5)
|
||
# # table.columns[1].width = Inches(4)
|
||
# # table.columns[2].width = Inches(4)
|
||
# # table.columns[3].width = Inches(0.5)
|
||
|
||
|
||
# heading_cells = table.rows[0].cells
|
||
|
||
# heading_cells[0].width = Inches(0.5)
|
||
# heading_cells[1].width = Inches(0.5)
|
||
# heading_cells[2].width = Inches(3.5)
|
||
# heading_cells[3].width = Inches(0.8)
|
||
# heading_cells[4].width = Inches(3.5)
|
||
# heading_cells[5].width = Inches(2)
|
||
|
||
# heading_cells[0].text = 'Line No'
|
||
# heading_cells[1].text = 'Audited Line No'
|
||
# heading_cells[2].text = 'Current Content'
|
||
# heading_cells[3].text = 'Script Element'
|
||
# heading_cells[4].text = 'New Content'
|
||
# heading_cells[5].text = 'Changes Done'
|
||
|
||
# print("assigned heading")
|
||
# for i in range(0,6):
|
||
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
|
||
|
||
# print("assigned Index")
|
||
# for index in audit_df.index:
|
||
|
||
# row_index = 1
|
||
|
||
# #line_no = audit_df['line_no'][index]
|
||
|
||
# cells = table.add_row().cells
|
||
# cells[0].width = Inches(0.5)
|
||
# cells[0].text = str(index)
|
||
|
||
|
||
# audited_line_no = audit_df['audited_line_no'][index]
|
||
# data = str(audited_line_no)
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = data
|
||
|
||
# cur_data = audit_df['data'][index]
|
||
# data = cur_data
|
||
# cells[2].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[2].text = data
|
||
|
||
# if audit_df['Identification_Status'][index] == 'blank':
|
||
# script_element = 'Blank Line'
|
||
# elif audit_df['Identification_Status'][index] == '':
|
||
# if audit_df['introduction'][index] == 'Yes':
|
||
# script_element = 'Title/Introduction'
|
||
# elif audit_df['appendix'][index] == 'Yes':
|
||
# script_element = 'Appendix'
|
||
# # -----------------------------changed with mohit sir
|
||
# else:
|
||
# continue
|
||
# # -----------------------------changed with mohit sir
|
||
# else:
|
||
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# data = script_element
|
||
# cells[3].width = Inches(0.8)
|
||
# cells[3].text = data
|
||
|
||
# new_data = audit_df['data_corrected'][index]
|
||
# data = new_data
|
||
# cells[4].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[4].text = data
|
||
|
||
|
||
# # if audit_df['line_removed'][index] == 'Yes':
|
||
# # data = "Line was removed"
|
||
# # run = para.add_run()
|
||
# # run.text = data
|
||
# # run.add_break()
|
||
# # continue
|
||
|
||
|
||
|
||
|
||
# sno = 1
|
||
# changes_done = False
|
||
|
||
|
||
# if audit_df['left_indent_corrected'][index] != 'No':
|
||
# change_comment = audit_df['left_indent_corrected'][index]
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['right_indent_corrected'][index] != 'No':
|
||
# change_comment = audit_df['right_indent_corrected'][index]
|
||
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['case_corrected'][index] != 'No':
|
||
# change_comment = 'Case ' + str(audit_df['case_corrected'][index])
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
|
||
# change_comment = 'Line Wrapped at Prescribed Right Indent'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
|
||
# change_comment = 'Line Broken into Multiple Lines'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_merged_with_next_line'][index] != 'No':
|
||
# change_comment = 'Line Merged with Next Line'
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['language_specific_audit_comments'][index] != 'No':
|
||
# change_comment = str(audit_df['language_specific_audit_comments'][index])
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if not changes_done:
|
||
# data = 'No Changes Done'
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
|
||
# row_index += 1
|
||
|
||
# buffer = io.BytesIO()
|
||
# output_doc.save(buffer)
|
||
# buffer.seek(0)
|
||
# print("complete")
|
||
|
||
# #output_doc.save(audit_report_tabular_docx)
|
||
# return buffer
|
||
|
||
|
||
|
||
# def print_audit_report_tabular_docx(audit_df):
|
||
|
||
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
|
||
|
||
# output_doc = Document()
|
||
|
||
# style = output_doc.styles['Normal']
|
||
# font = style.font
|
||
# #font.name = 'Courier New'
|
||
# font.size = Pt(8)
|
||
|
||
# section = output_doc.sections[-1]
|
||
# section.orientation = WD_ORIENT.LANDSCAPE
|
||
|
||
# section.page_width = Inches(11)
|
||
# section.left_margin = Inches(0.25)
|
||
# section.right_margin = Inches(0.25)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(12)
|
||
# run.text = ' Audit Report'
|
||
# run.add_break()
|
||
# run.add_break()
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(10)
|
||
# run.text = ' Audit Summary'
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(9)
|
||
|
||
# case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:])
|
||
# left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:])
|
||
# right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:])
|
||
# wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:])
|
||
|
||
|
||
# table =output_doc.add_table(1, cols =2)
|
||
# table.style = 'Table Grid'
|
||
|
||
# font.size = Pt(9)
|
||
|
||
# heading_cells = table.rows[0].cells
|
||
# heading_cells[0].width = Inches(2)
|
||
# heading_cells[1].width = Inches(1)
|
||
# heading_cells[0].text = 'Type of Change Done'
|
||
# heading_cells[1].text = 'Count of Lines'
|
||
# for i in range(0,2):
|
||
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Case Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(case_corrected_count)
|
||
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Left Indent Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(left_indent_corrected_count)
|
||
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Case Corrected'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(right_indent_corrected_count)
|
||
|
||
# cells = table.add_row().cells
|
||
# font.size = Pt(8)
|
||
# cells[0].width = Inches(2)
|
||
# cells[0].text = 'Lines Wrapped at prescribed indents'
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = str(wrapped_lines_count)
|
||
|
||
|
||
|
||
# run.add_break()
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# run.add_break()
|
||
# run.add_break()
|
||
|
||
# font.size = Pt(8)
|
||
|
||
# no_rows = len(audit_df.index)
|
||
# table =output_doc.add_table(1, cols =6)
|
||
# table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
# table.style = 'Table Grid'
|
||
|
||
# table.autofit = False
|
||
# table.columns[0].width = Inches(0.5)
|
||
# table.columns[1].width = Inches(1.2)
|
||
# table.columns[2].width = Inches(2)
|
||
# table.columns[3].width = Inches(1.5)
|
||
# table.columns[4].width = Inches(2)
|
||
# table.columns[5].width = Inches(2.5)
|
||
|
||
# heading_cells = table.rows[0].cells
|
||
|
||
# heading_cells[0].width = Inches(0.5)
|
||
# heading_cells[1].width = Inches(0.5)
|
||
# heading_cells[2].width = Inches(3.5)
|
||
# heading_cells[3].width = Inches(0.8)
|
||
# heading_cells[4].width = Inches(3.5)
|
||
# heading_cells[5].width = Inches(2)
|
||
|
||
# heading_cells[0].text = 'Line No'
|
||
# heading_cells[1].text = 'Audited Line No'
|
||
# heading_cells[2].text = 'Current Content'
|
||
# heading_cells[3].text = 'Script Element'
|
||
# heading_cells[4].text = 'New Content'
|
||
# heading_cells[5].text = 'Changes Done'
|
||
|
||
|
||
# for i in range(0,6):
|
||
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
|
||
|
||
|
||
# for index in audit_df.index:
|
||
# columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
|
||
# audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
|
||
# if audit_df.loc[index, columns_to_check].eq('No').all().all():
|
||
# continue
|
||
|
||
# elif audit_df['introduction'][index] == 'Yes':
|
||
# continue
|
||
|
||
# elif audit_df['appendix'][index] == 'Yes':
|
||
# continue
|
||
|
||
# elif audit_df['Identification_Status'][index] == 'blank':
|
||
# continue
|
||
|
||
# elif pd.isna(audit_df.loc[index, "Identification_Status"]):
|
||
# continue
|
||
# row_index = 1
|
||
|
||
# #line_no = audit_df['line_no'][index]
|
||
|
||
# cells = table.add_row().cells
|
||
# cells[0].width = Inches(0.5)
|
||
# cells[0].text = str(index)
|
||
|
||
|
||
# audited_line_no = audit_df['audited_line_no'][index]
|
||
# data = str(audited_line_no)
|
||
# cells[1].width = Inches(0.5)
|
||
# cells[1].text = data
|
||
|
||
# cur_data = audit_df['data'][index]
|
||
# data = str(cur_data).strip()
|
||
# cells[2].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[2].text = data
|
||
|
||
# if audit_df['Identification_Status'][index] == 'blank':
|
||
# script_element = 'Blank Line'
|
||
# elif audit_df['Identification_Status'][index] == '':
|
||
# if audit_df['introduction'][index] == 'Yes':
|
||
# script_element = 'Title/Introduction'
|
||
# elif audit_df['appendix'][index] == 'Yes':
|
||
# script_element = 'Appendix'
|
||
# # -----------------------------changed with mohit sir
|
||
# else:
|
||
# continue
|
||
# # -----------------------------changed with mohit sir
|
||
# else:
|
||
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# data = script_element
|
||
# cells[3].width = Inches(0.8)
|
||
# cells[3].text = data
|
||
|
||
# new_data = audit_df['data_corrected'][index]
|
||
# data = str(new_data).strip()
|
||
# cells[4].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[4].text = data
|
||
|
||
# sno = 1
|
||
# changes_done = False
|
||
|
||
# # identification_status = audit_df['Identification_Status'][index]
|
||
# if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
|
||
# continue
|
||
|
||
|
||
# if audit_df['left_indent_corrected'][index] != 'No':
|
||
# change_comment = audit_df['left_indent_corrected'][index]
|
||
# try:
|
||
# str_int = change_comment[-2]+change_comment[-1]
|
||
# except Exception as e:
|
||
# pass
|
||
# if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
|
||
# if str_int == "15":
|
||
# change_comment = "Dialogue line left index corrected to 1.5 Inch"
|
||
# elif str_int == "25":
|
||
# change_comment = "Dialogue line left index corrected to 2.5 Inch"
|
||
|
||
|
||
# if str_int == "15":
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f"{name} line left indent corrected to 1.5 Inch"
|
||
# print(change_comment)
|
||
# elif str_int == "25":
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_commen = f"{name} left indent corrected to 2.5 Inch"
|
||
|
||
# elif str_int == "30":
|
||
# change_comment = "Parenthetical left indent corrected to 3 Inch"
|
||
# elif str_int == "35":
|
||
# change_comment = "Speaker left indent corrected to 3.5 Inch"
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['right_indent_corrected'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = audit_df['right_indent_corrected'][index]
|
||
# try:
|
||
# str_int = change_comment[-2]+change_comment[-1]
|
||
# except Exception as e:
|
||
# pass
|
||
# if str_int == "10":
|
||
# change_comment = f"{name} right indent corrected to 1 Inch"
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['case_corrected'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# string = str(audit_df['case_corrected'][index])
|
||
# string = string.split()
|
||
# content = string[-1]
|
||
# if content == "AllUpper":
|
||
# change_comment = f'{name} Case ' + "Corrected to All Upper"
|
||
# elif content == "AllLower":
|
||
# change_comment = f'{name} Case ' + "Corrected to All Lowerr"
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
|
||
# change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# if name == "Action":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# elif name == "Dialogue":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f'{name} line Broken into Multiple Lines'
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_merged_with_next_line'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f'{name} line Merged with Next Line'
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['language_specific_audit_comments'][index] != 'No':
|
||
# pass
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
|
||
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['blank_inserted_after'][index] != 'No':
|
||
# change_comment = 'A blank line is added below'
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if not changes_done:
|
||
# continue
|
||
# # data = 'No Changes Done'
|
||
# # cells[5].width = Inches(2)
|
||
# # para = cells[5].add_paragraph()
|
||
# # run = para.add_run()
|
||
# # run.text = data
|
||
# # run.add_break()
|
||
|
||
# row_index += 1
|
||
|
||
# buffer = io.BytesIO()
|
||
# output_doc.save(buffer)
|
||
# buffer.seek(0)
|
||
|
||
|
||
# # output_doc.save(audit_report_tabular_docx)
|
||
# return buffer
|
||
|
||
# def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language):
|
||
|
||
# #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
|
||
|
||
# total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])])
|
||
# # <---------------------BLANK LINE ADD AND remove LOGIC IS HERE----------------->
|
||
# blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] )
|
||
# blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] )
|
||
# blankline_inserted = blankline_added + blank_add_after
|
||
|
||
# blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] )
|
||
# blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] )
|
||
# blankline_removed_total = blankline_rem_before + blank_rem_after
|
||
|
||
# ### <<----------------- logic for case --------------------------------->
|
||
# # for slugline
|
||
# # case corrected
|
||
# sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
|
||
# print(sluglinecase_corrected_count)
|
||
# # indentation corrected
|
||
# sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count
|
||
# print("sluglin_indentation:",slugline_indentation)
|
||
# # format corrected
|
||
# slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
# slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6
|
||
# print("slugline_formated",slugline_formated)
|
||
# #total sluglines
|
||
# total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
|
||
# print(total_no_sluglines)
|
||
|
||
# # for action -----line
|
||
# # case corrected
|
||
# actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
|
||
# print(actionlinecase_corrected_count)
|
||
# # indentation corrected
|
||
# actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count
|
||
# print("actionliine_indentation:",actionline_indentation)
|
||
# # format corrected
|
||
# actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
# actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6
|
||
# print("actionline_formated",actionline_formated)
|
||
# #total no of actionline
|
||
# total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
|
||
|
||
# print(total_actionlines)
|
||
|
||
|
||
# # for Speaker
|
||
# # case corrected
|
||
# speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :])
|
||
# print("speakercase_corrected_count", speakercase_corrected_count)
|
||
# # indentation corrected
|
||
# speakerleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count
|
||
# print("speaker_indentation:",speaker_indentation)
|
||
# # format corrected
|
||
# speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
# speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + speaker_formate4 + speaker_formate5 + speaker_formate6
|
||
# print("speaker_formated",speaker_formated)
|
||
# #total no of speaker -speaker
|
||
# total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
|
||
# print(total_no_speaker)
|
||
|
||
|
||
# # for Parenthetical -----line
|
||
# # case corrected
|
||
# parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :])
|
||
# print(parentheticalcase_corrected_count)
|
||
# # indentation corrected
|
||
# parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_line_indentation = parenthetical_left_indent_corrected_count + parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count
|
||
# print("parenthetical_line_indentation:",parenthetical_line_indentation)
|
||
# # format corrected
|
||
# parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
# parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6
|
||
# print("parenthetical_formated",parenthetical_formated)
|
||
# #total number of parenthetical
|
||
# total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
|
||
# print(total_no_parenthetical)
|
||
|
||
|
||
# # for Dialogue -----line
|
||
# # case corrected
|
||
# Dialogue_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :])
|
||
# print(Dialogue_case_corrected_count)
|
||
# # indentation corrected
|
||
# dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count
|
||
# print("dialogue_line_indentation:",dialogue_line_indentation)
|
||
# # format corrected
|
||
# dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
# dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6
|
||
# print("dialogue_formated",dialogue_formated)
|
||
# # total number of dialogue
|
||
# total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
|
||
# print(total_no_dialogue)
|
||
|
||
# # for Transition -----line
|
||
# # case corrected
|
||
# transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :])
|
||
# print(transitions_case_corrected_count)
|
||
# # indentation corrected
|
||
# transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count
|
||
# print("transitions_line_indentation:",transitions_line_indentation)
|
||
# # formate corrected
|
||
# transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
# transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6
|
||
# print("transitions_formated",transitions_formated)
|
||
# #total transition
|
||
# total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:])
|
||
|
||
# print(total_no_transition)
|
||
|
||
|
||
|
||
# # for Special Terms -----line
|
||
# # case corrected
|
||
# st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :])
|
||
# print("st_case_corrected_count",st_case_corrected_count)
|
||
# # indentation corrected
|
||
# st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count
|
||
# print("st_line_indentation:",st_line_indentation)
|
||
# # formate corrected
|
||
# st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
# st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6
|
||
# print("st_formated",st_formated)
|
||
# #total number of special terms
|
||
# total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:])
|
||
# if total_special_terms < 1 :
|
||
# total_special_terms = 1
|
||
# print(total_special_terms)
|
||
|
||
|
||
|
||
# # write logic for the percentage
|
||
# #a
|
||
# difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber)
|
||
# average_of_page_no = (int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2
|
||
# final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100
|
||
|
||
# #b
|
||
# difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no)
|
||
# average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2
|
||
# final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100
|
||
|
||
# #c
|
||
# try:
|
||
# ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100
|
||
# except:
|
||
# ratio_for_blanklines = 0
|
||
|
||
# #j
|
||
# try:
|
||
# ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100
|
||
# except:
|
||
# ratio_for_sluglines = 0
|
||
# #d
|
||
# try:
|
||
# ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100
|
||
# except:
|
||
# ratio_for_actionlines = 0
|
||
|
||
# #e
|
||
# try:
|
||
# ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100
|
||
# except:
|
||
# ratio_for_Speaker = 0
|
||
|
||
|
||
# #f
|
||
# try:
|
||
# ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100
|
||
# except:
|
||
# ratio_for_parenthetical = 0
|
||
# #g
|
||
# try:
|
||
# ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100
|
||
# except:
|
||
# ratio_for_dialogues = 0
|
||
# #h
|
||
# try:
|
||
# ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100
|
||
# except:
|
||
# ratio_for_transitions = 0
|
||
|
||
# #i
|
||
# try:
|
||
# ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100
|
||
# except:
|
||
# ratio_for_special_terms = 0
|
||
|
||
# average_of_c_j = (ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7
|
||
# audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j)
|
||
# audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%"
|
||
# print("audit_configuration_percentage",audit_configuration_percentage_str)
|
||
|
||
# total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms)
|
||
# print("total_script_element_correct",total_script_element_correct)
|
||
# audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no
|
||
# print("audit_script_accuracy",audit_script_accuracy)
|
||
# # audit_script_accuracy_str = min(audit_script_accuracy*100 , 100)
|
||
# audit_script_accuracy_str = min(audit_script_accuracy*100,100)
|
||
# audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%"
|
||
# print("audit_script_accuracy_str",audit_script_accuracy_str)
|
||
|
||
|
||
|
||
# # the table logics ends here
|
||
# # percenteage table from here
|
||
|
||
# output_doc = Document()
|
||
# style = output_doc.styles['Normal']
|
||
# font = style.font
|
||
# #font.name = 'Courier New'
|
||
# font.size = Pt(10)
|
||
|
||
# section = output_doc.sections[-1]
|
||
# section.orientation = WD_ORIENT.LANDSCAPE
|
||
|
||
# section.page_width = Inches(11)
|
||
# section.left_margin = Inches(0.25)
|
||
# section.right_margin = Inches(0.25)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# # Audit Summary at center of the page with bold
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.bold = True
|
||
# font.size = Pt(14)
|
||
# run.text = ' Audit Summary'
|
||
# run.add_break()
|
||
|
||
# # Add a paragraph for the left-aligned "Audit Date"
|
||
# current_date = date.today()
|
||
# # Convert to the "day month year" format
|
||
# formatted_date = current_date.strftime("%d %B %Y")
|
||
# left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAudit Date: " + str(formatted_date))
|
||
# left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||
# font_audit_date = left_aligned_text.runs[0].font
|
||
# font_audit_date.size = Pt(12)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# right_aligned_text = para.add_run('\t\tScriptname: ' + str(scriptname))
|
||
# right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT
|
||
# font_right = right_aligned_text.font
|
||
# font_right.size = Pt(12)
|
||
|
||
# author_para = output_doc.add_paragraph()
|
||
# run_author = author_para.add_run("\t\tAuthor: " + str(author))
|
||
# font_author = run_author.font
|
||
# font_author.size = Pt(12)
|
||
|
||
# language_script_para = output_doc.add_paragraph()
|
||
# run_language_script = language_script_para.add_run("\t\tLanguage of Script: " + str(script_language))
|
||
# font_language_script = run_language_script.font
|
||
# font_language_script.size = Pt(12)
|
||
|
||
# language_dialogue_para = output_doc.add_paragraph()
|
||
# run_language_dialogue = language_dialogue_para.add_run("\t\tLanguage of Dialogue: " + str(dialogue_language))
|
||
# font_language_dialogue = run_language_dialogue.font
|
||
# font_language_dialogue.size = Pt(12)
|
||
|
||
|
||
|
||
# # for pre audit and post Audit
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(11)
|
||
|
||
# percent_table = output_doc.add_table(1, cols =2)
|
||
# percent_table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
# percent_table.style = 'Table Grid'
|
||
|
||
# percent_heading_cells = percent_table.rows[0].cells
|
||
# percent_heading_cells[0].width = Inches(1.5)
|
||
# percent_heading_cells[1].width = Inches(1)
|
||
# percent_heading_cells[0].text = 'Audit Contribution'
|
||
# percent_heading_cells[1].text = str(audit_configuration_percentage_str)
|
||
|
||
# percent_heading_cells = percent_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# percent_heading_cells[0].width = Inches(1.5)
|
||
# percent_heading_cells[0].text = 'Audit Script Accuracy'
|
||
# percent_heading_cells[1].width = Inches(1)
|
||
# percent_heading_cells[1].text = str(audit_script_accuracy_str)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
|
||
# pre_post_table = output_doc.add_table(1, cols =3)
|
||
# pre_post_table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
# pre_post_table.style = 'Table Grid'
|
||
|
||
# preheading_cells = pre_post_table.rows[0].cells
|
||
# preheading_cells[0].width = Inches(1.5)
|
||
# preheading_cells[1].width = Inches(1)
|
||
# preheading_cells[2].width = Inches(1)
|
||
# preheading_cells[1].text = 'Pre Audit'
|
||
# preheading_cells[2].text = 'Post Audit'
|
||
|
||
# # row No of pages
|
||
# pcells = pre_post_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# pcells[0].width = Inches(1.5)
|
||
# pcells[0].text = 'No of Pages'
|
||
# pcells[1].width = Inches(1)
|
||
# pcells[1].text = str(pre_audit_pagenumber)
|
||
# pcells[2].width = Inches(1)
|
||
# pcells[2].text = str(postauditpagenumber)
|
||
# # row no of lines
|
||
# pcells = pre_post_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# pcells[0].width = Inches(1.5)
|
||
# pcells[0].text = 'No of lines'
|
||
# pcells[1].width = Inches(1)
|
||
# pcells[1].text = str(preaudit_line_no)
|
||
# pcells[2].width = Inches(1)
|
||
# pcells[2].text = str(postaudit_line_no)
|
||
|
||
# # adding extra line after the table above
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(12)
|
||
# #--------------------------------------
|
||
|
||
|
||
# bl_table = output_doc.add_table(1, cols =2)
|
||
# bl_table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
# bl_table.style = 'Table Grid'
|
||
|
||
# bl_heading_cells = bl_table.rows[0].cells
|
||
# bl_heading_cells[0].width = Inches(1.5)
|
||
# bl_heading_cells[0].text = 'Blank Lines Added'
|
||
# bl_heading_cells[1].width = Inches(1.5)
|
||
# bl_heading_cells[1].text = str(blankline_inserted) # add the number here
|
||
|
||
# blcells = bl_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# blcells[0].width = Inches(1.5)
|
||
# blcells[0].text = 'Blank Lines Removed'
|
||
# blcells[1].width = Inches(1.5)
|
||
# blcells[1].text = str(blankline_removed_total) # add the number here
|
||
|
||
# # adding extra line after the table above
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(12)
|
||
|
||
# sum_table = output_doc.add_table(1, cols =4)
|
||
# sum_table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
# sum_table.style = 'Table Grid'
|
||
|
||
# sum_heading_cells = sum_table.rows[0].cells
|
||
# sum_heading_cells[0].width = Inches(1.5)
|
||
# sum_heading_cells[0].text = ''
|
||
# sum_heading_cells[1].width = Inches(1.5)
|
||
# sum_heading_cells[1].height = Inches(0.5)
|
||
# sum_heading_cells[1].text = 'Case Correction'
|
||
# sum_heading_cells[2].width = Inches(1.5)
|
||
# sum_heading_cells[2].text = 'Indent Correction'
|
||
# sum_heading_cells[3].width = Inches(1.5)
|
||
# sum_heading_cells[3].text = 'Format Correction'
|
||
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].height = Inches(0.3)
|
||
# sum_cells[0].text = 'Sluglines'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].height = Inches(0.3)
|
||
# sum_cells[1].text = str(sluglinecase_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].height = Inches(0.3)
|
||
# sum_cells[2].text = str(slugline_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].height = Inches(0.3)
|
||
# sum_cells[3].text = str(slugline_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Actioin Lines'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(actionlinecase_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(actionline_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(actionline_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Speakers'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(speakercase_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(speaker_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(speaker_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Parentheticals'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(parentheticalcase_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(parenthetical_line_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(parenthetical_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Dialogues'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(Dialogue_case_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(dialogue_line_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(dialogue_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Transitions'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(transitions_case_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(transitions_line_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(transitions_formated)
|
||
|
||
# sum_cells = sum_table.add_row().cells
|
||
# font.size = Pt(12)
|
||
# sum_cells[0].width = Inches(1.5)
|
||
# sum_cells[0].text = 'Special Terms'
|
||
# sum_cells[1].width = Inches(1.5)
|
||
# sum_cells[1].text = str(st_case_corrected_count)
|
||
# sum_cells[2].width = Inches(1.5)
|
||
# sum_cells[2].text = str(st_line_indentation)
|
||
# sum_cells[3].width = Inches(1.5)
|
||
# sum_cells[3].text = str(st_line_indentation)
|
||
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.size = Pt(12)
|
||
|
||
# para = output_doc.add_paragraph()
|
||
# run = para.add_run()
|
||
# run.add_break()
|
||
# run.add_break()
|
||
# #--------------------------- 14-09-2023
|
||
# for _ in range(5):
|
||
# output_doc.add_paragraph()
|
||
# #----------------------- 14-09-23
|
||
# para = output_doc.add_paragraph()
|
||
# para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# # Audit detail at center of the page with bold
|
||
# run = para.add_run()
|
||
# font = run.font
|
||
# font.bold = True
|
||
# font.size = Pt(14)
|
||
# run.text = ' Audit Details'
|
||
# run.add_break()
|
||
# # -------------------------- 14-09-23
|
||
|
||
# no_rows = len(audit_df.index)
|
||
|
||
# table =output_doc.add_table(1, cols =6)
|
||
# table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
|
||
# table.style = 'Table Grid'
|
||
|
||
# table.autofit = False
|
||
# table.columns[0].width = Inches(0.5)
|
||
# table.columns[1].width = Inches(1.2)
|
||
# table.columns[2].width = Inches(2)
|
||
# table.columns[3].width = Inches(1.5)
|
||
# table.columns[4].width = Inches(2)
|
||
# table.columns[5].width = Inches(2.5)
|
||
# # table.columns[3].width = Inches(0.5)
|
||
|
||
|
||
# heading_cells = table.rows[0].cells
|
||
|
||
# heading_cells[0].width = Inches(0.1)
|
||
# heading_cells[1].width = Inches(0.1)
|
||
# heading_cells[2].width = Inches(3.5)
|
||
# heading_cells[3].width = Inches(0.8)
|
||
# heading_cells[4].width = Inches(3.5)
|
||
# heading_cells[5].width = Inches(2)
|
||
|
||
# heading_cells[0].text = 'Line No'
|
||
# heading_cells[1].text = 'Audited Line No'
|
||
# heading_cells[2].text = 'Current Content'
|
||
# heading_cells[3].text = 'Script Element'
|
||
# heading_cells[4].text = 'New Content'
|
||
# heading_cells[5].text = 'Changes Done'
|
||
|
||
|
||
# for i in range(0,6):
|
||
# heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
# heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
|
||
|
||
|
||
# #------------------------------->LOGIC HERE<---------------------------------------------
|
||
# report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no'])
|
||
|
||
# for index in audit_df.index:
|
||
|
||
# columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
|
||
# audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
|
||
# if audit_df.loc[index, columns_to_check].eq('No').all().all():
|
||
# # All columns contain 'No', skip this row
|
||
# continue
|
||
# elif audit_df['introduction'][index] == 'Yes':
|
||
# continue
|
||
# elif audit_df['appendix'][index] == 'Yes':
|
||
# continue
|
||
# elif audit_df['Identification_Status'][index] == 'blank':
|
||
# continue
|
||
# elif pd.isna(audit_df.loc[index, "Identification_Status"]):
|
||
# continue
|
||
|
||
|
||
|
||
|
||
# para_value = audit_df["para_no"][index] # ---------------------------------------------><-------------------------
|
||
# current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None
|
||
# if para_value == current_para_value:
|
||
# continue
|
||
# else:
|
||
# # report_df = report_df.append(audit_df.loc[index], ignore_index=True)
|
||
# new_row = audit_df.loc[index].to_frame().T
|
||
# report_df = pd.concat([report_df, new_row], ignore_index=True)
|
||
# print("current_para_value",current_para_value)
|
||
|
||
|
||
# row_index = 1
|
||
# old_line_no_index = index
|
||
# collection_old_line_no = []
|
||
# while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value):
|
||
# if audit_df['Identification_Status'][old_line_no_index] != "blank":
|
||
# try:
|
||
# data = int(old_line_no_index)
|
||
# collection_old_line_no.append(str(data))
|
||
# except ValueError:
|
||
# pass
|
||
# old_line_no_index += 1
|
||
|
||
# cells = table.add_row().cells
|
||
# cells[0].width = Inches(0.1)
|
||
# cells[0].text = ', '.join(collection_old_line_no)
|
||
|
||
# audited_line_index = index
|
||
# #--------------------------------------audited_lino_no------------------
|
||
# collection_audited_line_no = []
|
||
# while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value):
|
||
# if audit_df['Identification_Status'][audited_line_index] != "blank":
|
||
# audited_line_no = audit_df['audited_line_no'][audited_line_index]
|
||
# try:
|
||
# data = int(audited_line_no)
|
||
# collection_audited_line_no.append(str(data))
|
||
# except ValueError:
|
||
# pass
|
||
# audited_line_index += 1
|
||
# print("collection_audited_line_no", collection_audited_line_no)
|
||
# data_string = ', '.join(collection_audited_line_no)
|
||
# print("data_string:", data_string)
|
||
# cells[1].width = Inches(0.1)
|
||
# cells[1].text = data_string
|
||
|
||
|
||
# #------------------------------>OLD DATA<---------------------------------
|
||
# data_index = index
|
||
# collection_data = []
|
||
# while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value):
|
||
# cur_data = audit_df['data'][data_index]
|
||
# if not pd.isna(cur_data): # Check if the value is not NaN
|
||
# data = str(cur_data).strip()
|
||
# collection_data.append(data)
|
||
# data_index += 1
|
||
|
||
# cells[2].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[2].text = '\n '.join(collection_data)
|
||
|
||
# if audit_df['Identification_Status'][index] == 'blank':
|
||
# script_element = 'Blank Line'
|
||
# elif audit_df['Identification_Status'][index] == '':
|
||
# if audit_df['introduction'][index] == 'Yes':
|
||
# script_element = 'Title/Introduction'
|
||
# elif audit_df['appendix'][index] == 'Yes':
|
||
# script_element = 'Appendix'
|
||
# else:
|
||
# continue
|
||
|
||
# else:
|
||
# script_element = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# data = script_element
|
||
# cells[3].width = Inches(0.8)
|
||
# cells[3].text = data
|
||
|
||
# collection_new_data = []
|
||
# new_data_index = index
|
||
|
||
# while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value):
|
||
# if audit_df["line_removed"][new_data_index] == "No":
|
||
# new_data = audit_df['data_corrected'][new_data_index]
|
||
# if not pd.isna(new_data): # Check if the value is not NaN
|
||
# data = str(new_data).strip()
|
||
# collection_new_data.append(data)
|
||
# new_data_index += 1
|
||
# data = str(new_data).strip()
|
||
# cells[4].width = Inches(3.5)
|
||
# data = str(data)
|
||
# cells[4].text = '\n '.join(collection_new_data)
|
||
|
||
# sno = 1
|
||
# changes_done = False
|
||
|
||
# # identification_status = audit_df['Identification_Status'][index]
|
||
# if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
|
||
# continue
|
||
|
||
|
||
# if audit_df['left_indent_corrected'][index] != 'No':
|
||
# change_comment = audit_df['left_indent_corrected'][index]
|
||
# try:
|
||
# str_int = change_comment[-2]+change_comment[-1]
|
||
# except Exception as e:
|
||
# pass
|
||
# if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
|
||
# if str_int == "15":
|
||
# change_comment = "Dialogue line left index corrected to 1.5 Inch"
|
||
# elif str_int == "25":
|
||
# change_comment = "Dialogue line left index corrected to 2.5 Inch"
|
||
|
||
|
||
# if str_int == "15":
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f"{name} line left indent corrected to 1.5 Inch"
|
||
# print(change_comment)
|
||
# elif str_int == "25":
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_commen = f"{name} left indent corrected to 2.5 Inch"
|
||
|
||
# elif str_int == "30":
|
||
# change_comment = "Parenthetical left indent corrected to 3 Inch"
|
||
# elif str_int == "35":
|
||
# change_comment = "Speaker left indent corrected to 3.5 Inch"
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['right_indent_corrected'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = audit_df['right_indent_corrected'][index]
|
||
# try:
|
||
# str_int = change_comment[-2]+change_comment[-1]
|
||
# except Exception as e:
|
||
# pass
|
||
# if str_int == "10":
|
||
# change_comment = f"{name} right indent corrected to 1 Inch"
|
||
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['case_corrected'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# string = str(audit_df['case_corrected'][index])
|
||
# string = string.split()
|
||
# content = string[-1]
|
||
# if content == "AllUpper":
|
||
# change_comment = f'{name} Case ' + "Corrected to All Upper"
|
||
# elif content == "AllLower":
|
||
# change_comment = f'{name} Case ' + "Corrected to All Lowerr"
|
||
# if len(str(change_comment)) <= 2 :
|
||
# continue
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
|
||
# change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# if name == "Action":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# elif name == "Dialogue":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_broken_into_multiple_lines'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f'{name} line Broken into Multiple Lines'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['line_merged_with_next_line'][index] != 'No':
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f'{name} line Merged with Next Line'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if audit_df['language_specific_audit_comments'][index] != 'No':
|
||
# pass
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
|
||
# if audit_df['blank_inserted_after'][index] != 'No':
|
||
# change_comment = 'A blank line is added below'
|
||
# # name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# # if name == "Action":
|
||
# # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# # elif name == "Dialogue":
|
||
# # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
|
||
|
||
# data = str(sno) + '. ' + str(change_comment)
|
||
# # dataa = data.split()
|
||
# # if dataa[-1] == "nan":
|
||
# # continue
|
||
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
# sno += 1
|
||
# changes_done = True
|
||
|
||
# if not changes_done:
|
||
# continue
|
||
# # data = 'No Changes Done'
|
||
# # cells[5].width = Inches(2)
|
||
# # para = cells[5].add_paragraph()
|
||
# # run = para.add_run()
|
||
# # run.text = data
|
||
# # run.add_break()
|
||
|
||
# row_index += 1
|
||
|
||
# buffer = io.BytesIO()
|
||
# output_doc.save(buffer)
|
||
# buffer.seek(0)
|
||
|
||
|
||
# #output_doc.save(audit_report_tabular_docx)
|
||
# return buffer
|
||
|
||
def count_the_line(text_file_path):
    """Return the number of lines in the text file at ``text_file_path``.

    The file is opened in text mode with the platform default encoding,
    exactly as before, but the lines are counted while streaming instead of
    first loading every line into a list.

    Args:
        text_file_path: Path of the text file to inspect.

    Returns:
        int: Number of lines in the file (0 for an empty file).
    """
    with open(text_file_path, 'r') as fp:
        # Iterating the handle yields one line at a time, so memory use stays
        # constant regardless of the file size.
        return sum(1 for _ in fp)
|
||
|
||
def convert_to_pdf(input_docx, out_folder):
    """Run headless LibreOffice to convert ``input_docx`` to PDF.

    The command is started with ``--outdir out_folder``; the call blocks until
    the LibreOffice process has exited. The exit status is not inspected.

    Args:
        input_docx: Path of the document handed to LibreOffice.
        out_folder: Directory passed to LibreOffice as the output directory.
    """
    command = ["libreoffice", "--headless", "--convert-to", "pdf"]
    command += ["--outdir", out_folder, input_docx]
    process = subprocess.Popen(command)
    # Progress trace, emitted while the conversion is already running.
    print(["--convert-to", "pdf", input_docx])
    process.communicate()
|
||
|
||
|
||
def countPages(docfile, pdf_file_path, base_path_directory):
    """Convert ``docfile`` to PDF and return the page count of the result.

    Args:
        docfile: Document passed to ``convert_to_pdf``.
        pdf_file_path: Path of the PDF that is opened and counted afterwards.
            # NOTE(review): presumably the file LibreOffice writes into
            # ``base_path_directory`` - confirm against the callers.
        base_path_directory: Output directory passed to ``convert_to_pdf``.

    Returns:
        int: Number of pages reported by ``PdfFileReader.getNumPages``.
    """
    convert_to_pdf(docfile, base_path_directory)
    print("converted to pdf")
    print("pdf_file_path",pdf_file_path)
    # Use a context manager so the PDF handle is closed once the page count
    # has been read (the previous version leaked the open file).
    with open(pdf_file_path, "rb") as pdf_stream:
        number_of_pages = PdfFileReader(pdf_stream).getNumPages()
    return number_of_pages
|
||
|
||
|
||
def convert_txt_to_docx(txt_file_path, docx_file_path):
    """Write the UTF-8 text file ``txt_file_path`` into a new .docx file.

    The whole text is placed into a single paragraph of a fresh document,
    which is then saved to ``docx_file_path``.

    Args:
        txt_file_path: Path of the UTF-8 encoded text file to read.
        docx_file_path: Destination path of the generated document.
    """
    with open(txt_file_path, 'r', encoding='utf-8') as source:
        body = source.read()

    document = docx.Document()
    document.add_paragraph(body)
    document.save(docx_file_path)
|
||
|
||
|
||
def csv_to_docx(csv: pd.DataFrame) -> Document:
    """Build a screenplay-formatted document from a script DataFrame.

    Every row of ``csv`` produces one paragraph (12 pt Courier New, no spacing
    before/after, 12 pt line spacing). The ``script_element`` column selects
    the left/right indents, ``content`` supplies the text. Sluglines and
    speakers are upper-cased, transitions are right-aligned, and ``blank``
    rows yield an empty paragraph.

    Note that the parameter name shadows the ``csv`` module inside this
    function; it is kept because callers may pass it by keyword.

    Args:
        csv: DataFrame with ``script_element`` and ``content`` columns.

    Returns:
        The populated python-docx document (not yet saved).
    """
    # (left_indent, right_indent) in inches per script element.
    indents = {
        "slugline": (0, 0),
        "action": (0, 0),
        "dialogue": (1.0, 1.25),
        "parenthetical": (1.5, 2.25),
        "speaker": (2, 1),
        "transition": (2.5, 0),
        "special_term": (0, 0),
    }
    upper_cased = ("slugline", "speaker")

    output_doc = Document()
    font = output_doc.styles["Normal"].font
    font.name = "Courier New"
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    # NOTE(review): 8.57 in is wider than A4 (8.27 in); kept as in the original.
    a4_right = 8.57
    section.page_width = Inches(a4_right)
    section.left_margin = Inches(1.5)

    for script_element, content in zip(csv["script_element"], csv["content"]):
        para = output_doc.add_paragraph()

        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        if script_element == "blank":
            # The empty paragraph added above represents the blank line.
            continue

        # Empty cells arrive as float NaN. Normalise them *before* any string
        # method is used; previously ``content.upper()`` ran first and raised
        # AttributeError for an empty slugline or speaker cell.
        if isinstance(content, float):
            content = ""

        if script_element in indents:
            left, right = indents[script_element]
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)

        if script_element == "transition":
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT

        if script_element in upper_cased:
            content = content.upper()

        para.text = content

    return output_doc
|
||
|
||
def language_detector_for_csv(orginal_csv_path):
    """Detect the language of every action and dialogue row of a script.

    Args:
        orginal_csv_path: Either an already loaded ``pandas.DataFrame`` or
            anything ``pandas.read_csv`` accepts (typically a CSV path). The
            data must provide ``script_element`` and ``content`` columns.

    Returns:
        tuple[list, list]: ``(actionline_lang, dialogue_lang)``. Each list has
        one entry per matching row, and each entry is a one-element list
        holding ``language_code[<detected language>]``.
    """
    # Decide explicitly instead of relying on a bare ``except`` around
    # ``read_csv``: that pattern also swallowed genuine read errors (missing
    # file, malformed CSV) and failed later with an unrelated AttributeError.
    if isinstance(orginal_csv_path, pd.DataFrame):
        audit_df = orginal_csv_path
    else:
        audit_df = pd.read_csv(orginal_csv_path)

    actionline_lang = []
    dialogue_lang = []
    targets = {"action": actionline_lang, "dialogue": dialogue_lang}

    for script_element, string_original in zip(
        audit_df["script_element"], audit_df["content"]
    ):
        bucket = targets.get(script_element)
        if bucket is None:
            # Only action and dialogue rows are analysed.
            continue
        src_lang = language_detector(string_original)
        bucket.append([language_code[src_lang]])

    return actionline_lang, dialogue_lang
|
||
|
||
def assign_para_no(df):
    """Number the paragraphs of an audited script.

    Rows are walked top to bottom and classified by their
    ``Identification_Status`` code (``blank``, ``ps1`` ... ``ps20``). A row
    that opens a paragraph receives the next paragraph number, and the rows
    that continue it receive the same number. Rows that never match a rule
    keep ``0``.

    Neighbouring rows are looked up by position with bounds checking, so a
    rule that peeks at the previous/next row no longer raises ``KeyError`` on
    the first or last row of the frame.

    Side effect: a ``para_no`` column is written into the passed ``df``.

    Args:
        df: DataFrame with an ``Identification_Status`` column.

    Returns:
        pandas.DataFrame: ``df`` with the ``para_no`` column moved to
        position 3 (or to the end if there are fewer columns).
    """
    statuses = df['Identification_Status'].tolist()
    total = len(statuses)
    numbers = [0] * total

    def status_at(pos):
        # None stands for "no such row" (before the first / after the last).
        return statuses[pos] if 0 <= pos < total else None

    def number_run(start, members, number):
        # Give ``number`` to the consecutive rows from ``start`` whose status
        # is one of ``members``.
        pos = start
        while status_at(pos) in members:
            numbers[pos] = number
            pos += 1

    # Statuses that always open a paragraph of their own.
    standalone = ('ps1', 'ps8', 'ps10', 'ps16', 'ps17')

    para_no = 1
    for pos, line_pos in enumerate(statuses):
        previous = status_at(pos - 1)
        following = status_at(pos + 1)

        if line_pos in standalone:
            numbers[pos] = para_no
            para_no += 1

        elif line_pos == 'ps2':
            numbers[pos] = para_no
            if following == 'ps3':
                numbers[pos + 1] = para_no
            para_no += 1

        elif line_pos == 'ps4':
            numbers[pos] = para_no
            number_run(pos + 1, ('ps5', 'ps6', 'ps4'), para_no)
            para_no += 1

        elif line_pos == 'ps13':
            numbers[pos] = para_no
            spot = pos + 1
            while status_at(spot) in ('ps14', 'ps15', 'ps13', 'blank'):
                # A blank directly followed by ps14 is stepped over without
                # being numbered; any other row in the run is numbered.
                if statuses[spot] == 'blank' and status_at(spot + 1) == 'ps14':
                    spot += 1
                numbers[spot] = para_no
                spot += 1
            para_no += 1

        elif line_pos == 'ps6':
            # After ps4/ps5 the row was already numbered by the ps4 rule.
            if previous not in ('ps5', 'ps4'):
                numbers[pos] = para_no
                para_no += 1

        elif line_pos == 'ps7':
            numbers[pos] = para_no
            number_run(pos + 1, ('ps8', 'ps9'), para_no)
            para_no += 1

        elif line_pos == 'ps15':
            if previous in ('ps7', 'ps12', 'ps10', 'ps20', 'ps8', 'blank'):
                numbers[pos] = para_no
                para_no += 1

        elif line_pos == 'ps14':
            if previous in ('ps8', 'ps7'):
                numbers[pos] = para_no
                number_run(pos + 1, ('ps15',), para_no)
                para_no += 1

        elif line_pos == 'ps11':
            numbers[pos] = para_no
            number_run(pos + 1, ('ps12', 'ps20'), para_no)
            para_no += 1

        elif line_pos == 'ps20':
            if previous == 'ps11':
                # Already numbered by the ps11 rule.
                continue
            if following == 'ps12':
                numbers[pos] = para_no
                numbers[pos + 1] = para_no
            # NOTE(review): a ps20 row matching neither case still consumes a
            # paragraph number without receiving it - kept from the original.
            para_no += 1

        # 'blank', 'ps12' and every other status: nothing to assign here.

    df['para_no'] = numbers

    columns = list(df.columns)
    columns.insert(3, columns.pop(columns.index('para_no')))
    return df[columns]
|
||
|
||
|
||
def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language):
|
||
|
||
#line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
|
||
|
||
total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])])
|
||
# <---------------------BLANK LINE ADD AND remove LOGIC IS HERE----------------->
|
||
blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] )
|
||
blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] )
|
||
blankline_inserted = blankline_added + blank_add_after
|
||
|
||
blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] )
|
||
blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] )
|
||
blankline_removed_total = blankline_rem_before + blank_rem_after
|
||
|
||
### <<----------------- logic for case --------------------------------->
|
||
# for slugline
|
||
# case corrected
|
||
sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
|
||
print(sluglinecase_corrected_count)
|
||
# indentatioin corrected
|
||
sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count
|
||
print("sluglin_indentation:",slugline_indentation)
|
||
# formate corrected
|
||
slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
|
||
slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6
|
||
print("slugline_formated",slugline_formated)
|
||
#total sluglines
|
||
total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
|
||
print(total_no_sluglines)
|
||
|
||
# for actioon -----line
|
||
# case corrected
|
||
actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
|
||
print(actionlinecase_corrected_count)
|
||
# indentatioin corrected
|
||
actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count
|
||
print("actionliine_indentation:",actionline_indentation)
|
||
# formate corrected
|
||
actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
|
||
actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6
|
||
print("actionline_formated",actionline_formated)
|
||
#total no of actionline
|
||
total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
|
||
|
||
print(total_actionlines)
|
||
|
||
|
||
# for Speaker
|
||
# case corrected
|
||
speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :])
|
||
print("speakercase_corrected_count", speakercase_corrected_count)
|
||
# indentatioin corrected
|
||
speakerleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count
|
||
print("speaker_indentation:",speaker_indentation)
|
||
# formate corrected
|
||
speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + speaker_formate4 + speaker_formate5 + speaker_formate6
|
||
print("speaker_formated",speaker_formated)
|
||
#total no of speaker -speaker
|
||
total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:])
|
||
|
||
print(total_no_speaker)
|
||
|
||
|
||
# for Parenthetical -----line
|
||
# case corrected
|
||
parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :])
|
||
print(parentheticalcase_corrected_count)
|
||
# indentatioin corrected
|
||
parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_line_indentation = parenthetical_left_indent_corrected_count + parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count
|
||
print("parenthetical_line_indentation:",parenthetical_line_indentation)
|
||
# formate corrected
|
||
parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6
|
||
print("parenthetical_formated",parenthetical_formated)
|
||
#total number of parenthetical
|
||
total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:])
|
||
|
||
print(total_no_parenthetical)
|
||
|
||
|
||
# for Dialogue -----line
|
||
# case corrected
|
||
Dialogue_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :])
|
||
print(Dialogue_case_corrected_count)
|
||
# indentatioin corrected
|
||
dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count
|
||
print("dialogue_line_indentation:",dialogue_line_indentation)
|
||
# formate corrected
|
||
dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6
|
||
print("dialogue_formated",dialogue_formated)
|
||
# total number of dialogue
|
||
total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:])
|
||
|
||
print(total_no_dialogue)
|
||
|
||
# for Transistion -----line
|
||
# case corrected
|
||
transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :])
|
||
print(transitions_case_corrected_count)
|
||
# indentatioin corrected
|
||
transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count
|
||
print("transitions_line_indentation:",transitions_line_indentation)
|
||
# formate corrected
|
||
transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:])
|
||
transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6
|
||
print("transitions_formated",transitions_formated)
|
||
#total transition
|
||
total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:])
|
||
|
||
print(total_no_transition)
|
||
|
||
|
||
|
||
# for Spectial Terms -----line
|
||
# case corrected
|
||
st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :])
|
||
print("st_case_corrected_count",st_case_corrected_count)
|
||
# indentatioin corrected
|
||
st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count
|
||
print("st_line_indentation:",st_line_indentation)
|
||
# formate corrected
|
||
st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:])
|
||
st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6
|
||
print("st_formated",st_formated)
|
||
#total numner of special terms
|
||
total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:])
|
||
if total_special_terms < 1 :
|
||
total_special_terms = 1
|
||
print(total_special_terms)
|
||
|
||
|
||
|
||
# write logic for the percentage
|
||
#a
|
||
difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber)
|
||
average_of_page_no = (int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2
|
||
final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100
|
||
|
||
#b
|
||
difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no)
|
||
average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2
|
||
final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100
|
||
|
||
#c
|
||
try:
|
||
ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100
|
||
except:
|
||
ratio_for_blanklines = 0
|
||
|
||
#j
|
||
try:
|
||
ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100
|
||
except:
|
||
ratio_for_sluglines = 0
|
||
#d
|
||
try:
|
||
ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100
|
||
except:
|
||
ratio_for_actionlines = 0
|
||
|
||
#e
|
||
try:
|
||
ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100
|
||
except:
|
||
ratio_for_Speaker = 0
|
||
|
||
|
||
#f
|
||
try:
|
||
ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100
|
||
except:
|
||
ratio_for_parenthetical = 0
|
||
#g
|
||
try:
|
||
ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100
|
||
except:
|
||
ratio_for_dialogues = 0
|
||
#h
|
||
try:
|
||
ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100
|
||
except:
|
||
ratio_for_transitions = 0
|
||
|
||
#i
|
||
try:
|
||
ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100
|
||
except:
|
||
ratio_for_special_terms = 0
|
||
|
||
average_of_c_j = (ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7
|
||
audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j)
|
||
audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%"
|
||
print("audit_configuration_percentage",audit_configuration_percentage_str)
|
||
|
||
total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms)
|
||
print("total_script_element_correct",total_script_element_correct)
|
||
audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no
|
||
print("audit_script_accuracy",audit_script_accuracy)
|
||
# audit_script_accuracy_str = min(audit_script_accuracy*100 , 100)
|
||
audit_script_accuracy_str = audit_script_accuracy*100
|
||
audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%"
|
||
print("audit_script_accuracy_str",audit_script_accuracy_str)
|
||
|
||
|
||
|
||
# the table logics ends here
|
||
# percenteage table from here
|
||
|
||
output_doc = Document()
|
||
style = output_doc.styles['Normal']
|
||
font = style.font
|
||
#font.name = 'Courier New'
|
||
font.size = Pt(10)
|
||
|
||
section = output_doc.sections[-1]
|
||
section.orientation = WD_ORIENT.LANDSCAPE
|
||
|
||
section.page_width = Inches(11)
|
||
section.left_margin = Inches(0.25)
|
||
section.right_margin = Inches(0.25)
|
||
|
||
para = output_doc.add_paragraph()
|
||
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# Audit Summary at center of the page with bold
|
||
run = para.add_run()
|
||
font = run.font
|
||
font.bold = True
|
||
font.size = Pt(18)
|
||
run.text = ' Audit Summary'
|
||
para = output_doc.add_paragraph()
|
||
# run.add_break()
|
||
|
||
# Add a paragraph for the left-aligned "Audit Date"
|
||
current_date = date.today()
|
||
# Convert to the "day month year" format
|
||
formatted_date = current_date.strftime("%d %B %Y")
|
||
string_date = "𝐀𝐮𝐝𝐢𝐭 𝐃𝐚𝐭𝐞"
|
||
left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t"+ string_date+ " : " + str(formatted_date))
|
||
left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT
|
||
font_audit_date = left_aligned_text.runs[0].font
|
||
font_audit_date.size = Pt(14)
|
||
font.bold = True
|
||
para = output_doc.add_paragraph()
|
||
|
||
table = output_doc.add_table(rows=2, cols=2)
|
||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
table.style = 'Colorful Shading Accent 6'
|
||
table.autofit = False # Turn off autofit to set cell widths explicitly
|
||
|
||
# Set column widths (you can adjust these values as needed)
|
||
table.columns[0].width = Pt(150)
|
||
table.columns[1].width = Pt(100)
|
||
# Access the first cell in the first row
|
||
cell = table.cell(0, 0)
|
||
cell.text = "Audit Contributions"
|
||
cell1 = table.cell(0, 1)
|
||
cell1.text = audit_configuration_percentage_str
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
for paragraph in cell1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# Vertically center the label cell
|
||
cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
# Access the first cell in the second row
|
||
cell = table.cell(1, 0)
|
||
cell.text = "Audited Accuracy"
|
||
cell1 = table.cell(1,1)
|
||
cell1.text = audit_script_accuracy_str
|
||
for paragraph in cell.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
for paragraph in cell1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
# Vertically center the label cell
|
||
cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
|
||
para = output_doc.add_paragraph()
|
||
para = output_doc.add_paragraph()
|
||
para = output_doc.add_paragraph()
|
||
|
||
right_aligned_text = para.add_run('\t\t𝐒𝐜𝐫𝐢𝐩𝐭 𝐍𝐚𝐦𝐞: ' + str(scriptname))
|
||
right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT
|
||
font_right = right_aligned_text.font
|
||
font_right.size = Pt(14)
|
||
|
||
author_para = output_doc.add_paragraph()
|
||
run_author = author_para.add_run("\t\t𝐀𝐮<F09D9080><F09D90AE>𝐡𝐨𝐫: " + str(author))
|
||
font_author = run_author.font
|
||
font_author.size = Pt(14)
|
||
|
||
language_script_para = output_doc.add_paragraph()
|
||
run_language_script = language_script_para.add_run("\t\t𝐋𝐚𝐧𝐠𝐮𝐚𝐠𝐞 𝐨𝐟 𝐒𝐜𝐫𝐢𝐩𝐭: " + str(script_language))
|
||
font_language_script = run_language_script.font
|
||
font_language_script.size = Pt(14)
|
||
|
||
language_dialogue_para = output_doc.add_paragraph()
|
||
run_language_dialogue = language_dialogue_para.add_run("\t\t𝐋𝐚𝐧𝐠𝐮𝐚𝐠𝐞 𝐨𝐟 𝐃𝐢𝐚𝐥𝐨𝐠𝐮𝐞: " + str(dialogue_language))
|
||
font_language_dialogue = run_language_dialogue.font
|
||
font_language_dialogue.size = Pt(14)
|
||
# Set 1pt spacing before/after every paragraph currently in output_doc.paragraphs
|
||
for para in output_doc.paragraphs:
|
||
para.paragraph_format.space_before = Pt(1)
|
||
para.paragraph_format.space_after = Pt(1)
|
||
|
||
para = output_doc.add_paragraph()
|
||
para = output_doc.add_paragraph()
|
||
|
||
# changes_string_line = output_doc.add_paragraph()
|
||
# run_changes_string_line = changes_string_line.add_run("\t\tStructural Changes\t\t\t\t\\t Blank Lines Adjustments ")
|
||
# font_changes_string_line = run_changes_string_line.font
|
||
# # Set font properties
|
||
# font_changes_string_line.color.rgb = WD_COLOR_INDEX.RED # Red font color
|
||
# font_changes_string_line.italic = True # Italic style
|
||
# font_changes_string_line.bold = True
|
||
paragraph = output_doc.add_paragraph()
|
||
paragraph = output_doc.add_paragraph()
|
||
run = paragraph.add_run("\t\t\t\tStructural Changes\t\t\t\t\t\tBlank Lines Adjustments ")
|
||
# Set font size
|
||
font = run.font
|
||
font.size = Pt(14)
|
||
# Set font color to red
|
||
font.color.rgb = RGBColor(255, 0, 0)
|
||
font.bold = True
|
||
font.italic = True
|
||
for para in output_doc.paragraphs:
|
||
para.paragraph_format.space_before = Pt(0)
|
||
para.paragraph_format.space_after = Pt(0)
|
||
|
||
|
||
|
||
|
||
table = output_doc.add_table(rows=1, cols=2)
|
||
table.allow_autofit = False
|
||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
table._cells[0].width = Inches(4.3)
|
||
table._cells[1].width = Inches(4.3)
|
||
|
||
|
||
column_first = table._cells[0].add_table(rows=3, cols=3)
|
||
column_second = table._cells[1].add_table(rows=2, cols=2)
|
||
column_first.style = 'Colorful Shading Accent 6'
|
||
column_second.style = 'Colorful Shading Accent 6'
|
||
|
||
column_first_row1 = column_first.cell(0,1)
|
||
column_first_row1.text ="Pre Audit"
|
||
column_first_row1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_first_row1_c2 = column_first.cell(0,2)
|
||
column_first_row1_c2.text ="Post Audit"
|
||
column_first_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row1_c2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_first_row2_c1 = column_first.cell(1,0)
|
||
column_first_row2_c1.text ="No of Pages"
|
||
column_first_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row2_c1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_first_row2_c2 = column_first.cell(1,1)
|
||
column_first_row2_c2.text = str(pre_audit_pagenumber)
|
||
column_first_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row2_c2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
column_first_row2_c3 = column_first.cell(1,2)
|
||
column_first_row2_c3.text = str(postauditpagenumber)
|
||
column_first_row2_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row2_c3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
column_first_row3_c1 = column_first.cell(2,0)
|
||
column_first_row3_c1.text = "No of Lines"
|
||
column_first_row3_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row3_c1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_first_row3_c2 = column_first.cell(2,1)
|
||
column_first_row3_c2.text = str(preaudit_line_no)
|
||
column_first_row3_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row3_c2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
column_first_row3_c3 = column_first.cell(2,2)
|
||
column_first_row3_c3.text = str(postaudit_line_no)
|
||
column_first_row3_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_first_row3_c3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
column_second_row1_c1 = column_second.cell(0,0)
|
||
column_second_row1_c1.text = "Blank Lines Added"
|
||
column_second_row1_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_second_row1_c1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_second_row1_c2 = column_second.cell(0,1)
|
||
column_second_row1_c2.text = str(blankline_inserted)
|
||
column_second_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_second_row1_c2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
column_second_row2_c1 = column_second.cell(1,0)
|
||
column_second_row2_c1.text = "Blank Lines Removed"
|
||
column_second_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_second_row2_c1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
column_second_row2_c2 = column_second.cell(1,1)
|
||
column_second_row2_c2.text = str(blankline_removed_total)
|
||
column_second_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in column_second_row2_c2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
paragraph = output_doc.add_paragraph()
|
||
paragraph = output_doc.add_paragraph()
|
||
paragraph = output_doc.add_paragraph()
|
||
run = paragraph.add_run("Summary of Correction made")
|
||
# Set font size
|
||
font = run.font
|
||
font.size = Pt(14)
|
||
# Set font color to red
|
||
font.color.rgb = RGBColor(255, 0, 0)
|
||
font.bold = True
|
||
font.italic = True
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table = output_doc.add_table(rows=9, cols=5)
|
||
summary_table.allow_autofit = False
|
||
summary_table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
summary_table.style = 'Colorful Shading Accent 6'
|
||
# # Calculate the column widths
|
||
# column_widths = [Inches(1), Inches(1.5), Inches(1.5), Inches(1.5), Inches(1)] # Adjust the widths as needed
|
||
|
||
# # Set the column widths
|
||
# for col, width in enumerate(column_widths):
|
||
# summary_table.columns[col].width = width
|
||
# table.columns[0].width = Pt(150)
|
||
# table.columns[1].width = Pt(100)
|
||
# summary_table.columns[0].width = Inches(1)
|
||
# summary_table.columns[1].width = Inches(1)
|
||
# summary_table.columns[2].width = Inches(1)
|
||
# summary_table.columns[3].width = Inches(1)
|
||
# summary_table.columns[4].width = Inches(0.5)
|
||
|
||
summary_table_row1_col_2 = summary_table.cell(0,1)
|
||
summary_table_row1_col_2.text ="Case Correction"
|
||
summary_table_row1_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row1_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row1_col_3 = summary_table.cell(0,2)
|
||
summary_table_row1_col_3.text ="Indent Correction"
|
||
summary_table_row1_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row1_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row1_col_4 = summary_table.cell(0,3)
|
||
summary_table_row1_col_4.text ="Format Correction"
|
||
summary_table_row1_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row1_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row1_col_5 = summary_table.cell(0,4)
|
||
summary_table_row1_col_5.text ="Total"
|
||
summary_table_row1_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
summary_table_row1_col_5.width = Inches(0.5)
|
||
|
||
for paragraph in summary_table_row1_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
run.font.bold = True
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
#---------ROW 2------------
|
||
summary_table_row2_col_1 = summary_table.cell(1,0)
|
||
summary_table_row2_col_1.text ="Sluglines"
|
||
summary_table_row2_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row2_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
summary_table_row2_col_2 = summary_table.cell(1,1)
|
||
summary_table_row2_col_2.text = str(sluglinecase_corrected_count)
|
||
summary_table_row2_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row2_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row2_col_3 = summary_table.cell(1,2)
|
||
summary_table_row2_col_3.text = str(slugline_indentation)
|
||
summary_table_row2_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row2_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row2_col_4 = summary_table.cell(1,3)
|
||
summary_table_row2_col_4.text = str(slugline_formated)
|
||
summary_table_row2_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row2_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row2_col_5 = summary_table.cell(1,4)
|
||
total_slug = slugline_formated+slugline_indentation+sluglinecase_corrected_count
|
||
summary_table_row2_col_5.text = str(total_slug)
|
||
summary_table_row2_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row2_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
summary_table_row3_col_1 = summary_table.cell(2,0)
|
||
summary_table_row3_col_1.text = "Action Lines"
|
||
summary_table_row3_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row3_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
|
||
summary_table_row3_col_2 = summary_table.cell(2,1)
|
||
summary_table_row3_col_2.text = str(actionlinecase_corrected_count)
|
||
summary_table_row3_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row3_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row3_col_3 = summary_table.cell(2,2)
|
||
summary_table_row3_col_3.text = str(actionline_indentation)
|
||
summary_table_row3_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row3_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row3_col_4 = summary_table.cell(2,3)
|
||
summary_table_row3_col_4.text = str(actionline_formated)
|
||
summary_table_row3_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row3_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row3_col_5 = summary_table.cell(2,4)
|
||
total_action_line = actionlinecase_corrected_count+actionline_indentation+actionline_formated
|
||
summary_table_row3_col_5.text = str(total_action_line)
|
||
summary_table_row3_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row3_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
#-----ROW 4 ---------
|
||
|
||
summary_table_row4_col_1 = summary_table.cell(3,0)
|
||
summary_table_row4_col_1.text = "Speaker"
|
||
summary_table_row4_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row4_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
# paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row4_col_2 = summary_table.cell(3,1)
|
||
summary_table_row4_col_2.text = str(speakercase_corrected_count)
|
||
summary_table_row4_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row4_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row4_col_3 = summary_table.cell(3,2)
|
||
summary_table_row4_col_3.text = str(speaker_indentation)
|
||
summary_table_row4_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row4_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row4_col_4 = summary_table.cell(3,3)
|
||
summary_table_row4_col_4.text = str(speaker_formated)
|
||
summary_table_row4_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row4_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row4_col_5 = summary_table.cell(3,4)
|
||
total_speaker = speaker_formated+speaker_indentation+speakercase_corrected_count
|
||
summary_table_row4_col_5.text = str(total_speaker)
|
||
summary_table_row4_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row4_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
|
||
# ----ROW 5 -------
|
||
summary_table_row5_col_1 = summary_table.cell(4,0)
|
||
summary_table_row5_col_1.text = "Parentheticals"
|
||
summary_table_row5_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row5_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
|
||
summary_table_row5_col_2 = summary_table.cell(4,1)
|
||
summary_table_row5_col_2.text = str(parentheticalcase_corrected_count)
|
||
summary_table_row5_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row5_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row5_col_3 = summary_table.cell(4,2)
|
||
summary_table_row5_col_3.text = str(parenthetical_line_indentation)
|
||
summary_table_row5_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row5_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row5_col_4 = summary_table.cell(4,3)
|
||
summary_table_row5_col_4.text = str(parenthetical_formated)
|
||
summary_table_row5_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row5_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row5_col_5 = summary_table.cell(4,4)
|
||
total_parenthetical = parenthetical_formated + parenthetical_line_indentation+parentheticalcase_corrected_count
|
||
summary_table_row5_col_5.text = str(total_parenthetical)
|
||
summary_table_row5_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row5_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
|
||
#ROW --- 6
|
||
summary_table_row6_col_1 = summary_table.cell(5,0)
|
||
summary_table_row6_col_1.text = "Dialogue"
|
||
summary_table_row6_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row6_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
summary_table_row6_col_2 = summary_table.cell(5,1)
|
||
summary_table_row6_col_2.text = str(Dialogue_case_corrected_count)
|
||
summary_table_row6_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row6_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row6_col_3 = summary_table.cell(5,2)
|
||
summary_table_row6_col_3.text = str(dialogue_line_indentation)
|
||
summary_table_row6_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row6_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row6_col_4 = summary_table.cell(5,3)
|
||
summary_table_row6_col_4.text = str(dialogue_formated)
|
||
summary_table_row6_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row6_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row6_col_5 = summary_table.cell(5,4)
|
||
total_dialogue = dialogue_formated + dialogue_line_indentation+Dialogue_case_corrected_count
|
||
summary_table_row6_col_5.text = str(total_dialogue)
|
||
summary_table_row6_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row6_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
|
||
#ROW --- 7
|
||
summary_table_row7_col_1 = summary_table.cell(6,0)
|
||
summary_table_row7_col_1.text = "Transitions"
|
||
summary_table_row7_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row7_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
summary_table_row7_col_2 = summary_table.cell(6,1)
|
||
summary_table_row7_col_2.text = str(transitions_case_corrected_count)
|
||
summary_table_row7_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row7_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row7_col_3 = summary_table.cell(6,2)
|
||
summary_table_row7_col_3.text = str(transitions_line_indentation)
|
||
summary_table_row7_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row7_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row7_col_4 = summary_table.cell(6,3)
|
||
summary_table_row7_col_4.text = str(transitions_formated)
|
||
summary_table_row7_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row7_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row7_col_5 = summary_table.cell(6,4)
|
||
total_transition = transitions_formated+transitions_line_indentation+transitions_case_corrected_count
|
||
summary_table_row7_col_5.text = str(total_transition)
|
||
summary_table_row7_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row7_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
#ROW --- 8
|
||
summary_table_row8_col_1 = summary_table.cell(7,0)
|
||
summary_table_row8_col_1.text = "Special Terms"
|
||
summary_table_row8_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row8_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
|
||
summary_table_row8_col_2 = summary_table.cell(7,1)
|
||
summary_table_row8_col_2.text = str(st_case_corrected_count)
|
||
summary_table_row8_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row8_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row8_col_3 = summary_table.cell(7,2)
|
||
summary_table_row8_col_3.text = str(st_line_indentation)
|
||
summary_table_row8_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row8_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row8_col_4 = summary_table.cell(7,3)
|
||
summary_table_row8_col_4.text = str(st_formated)
|
||
summary_table_row8_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row8_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row8_col_5 = summary_table.cell(7,4)
|
||
total_special_term = st_formated +st_line_indentation+ st_case_corrected_count
|
||
summary_table_row8_col_5.text = str(total_special_term)
|
||
summary_table_row8_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row8_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
|
||
#ROW --- 9
|
||
summary_table_row9_col_1 = summary_table.cell(8,0)
|
||
summary_table_row9_col_1.text = "Total"
|
||
summary_table_row9_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row9_col_1.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
run.font.bold = True
|
||
|
||
|
||
summary_table_row9_col_2 = summary_table.cell(8,1)
|
||
summary_table_row9_col_2.text = str(sluglinecase_corrected_count+actionlinecase_corrected_count+speakercase_corrected_count+parentheticalcase_corrected_count +Dialogue_case_corrected_count+transitions_case_corrected_count+st_case_corrected_count)
|
||
summary_table_row9_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row9_col_2.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row9_col_3 = summary_table.cell(8,2)
|
||
summary_table_row9_col_3.text = str(slugline_indentation+actionline_indentation+speaker_indentation+parenthetical_line_indentation+dialogue_line_indentation+transitions_line_indentation+st_line_indentation)
|
||
summary_table_row9_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row9_col_3.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
summary_table_row9_col_4 = summary_table.cell(8,3)
|
||
summary_table_row9_col_4.text = str(slugline_formated+actionline_formated+speaker_formated+parenthetical_formated+dialogue_formated+transitions_formated+st_formated)
|
||
summary_table_row9_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row9_col_4.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
summary_table_row9_col_5 = summary_table.cell(8,4)
|
||
summary_table_row9_col_5.text = str(total_slug+total_action_line+total_speaker+total_parenthetical+total_dialogue+total_transition+total_special_term)
|
||
summary_table_row9_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
|
||
for paragraph in summary_table_row9_col_5.paragraphs:
|
||
for run in paragraph.runs:
|
||
run.font.size = Pt(14)
|
||
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
|
||
#--------------------------- 14-09-2023
|
||
for _ in range(3):
|
||
output_doc.add_paragraph()
|
||
#----------------------- 14-09-23
|
||
para = output_doc.add_paragraph()
|
||
para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||
|
||
# Audit detail at center of the page with bold
|
||
run = para.add_run()
|
||
font = run.font
|
||
font.bold = True
|
||
font.size = Pt(18)
|
||
run.text = ' Audit Details'
|
||
run.add_break()
|
||
# -------------------------- 14-09-23
|
||
|
||
no_rows = len(audit_df.index)
|
||
|
||
table =output_doc.add_table(1, cols =6)
|
||
table.alignment = WD_TABLE_ALIGNMENT.CENTER
|
||
|
||
table.style = 'Colorful Shading Accent 6'
|
||
|
||
table.autofit = False
|
||
table.columns[0].width = Inches(0.5)
|
||
table.columns[1].width = Inches(1.2)
|
||
table.columns[2].width = Inches(2)
|
||
table.columns[3].width = Inches(1.5)
|
||
table.columns[4].width = Inches(2)
|
||
table.columns[5].width = Inches(2.5)
|
||
# table.columns[3].width = Inches(0.5)
|
||
|
||
|
||
heading_cells = table.rows[0].cells
|
||
|
||
heading_cells[0].width = Inches(0.1)
|
||
heading_cells[1].width = Inches(0.1)
|
||
heading_cells[2].width = Inches(3.5)
|
||
heading_cells[3].width = Inches(0.8)
|
||
heading_cells[4].width = Inches(3.5)
|
||
heading_cells[5].width = Inches(2)
|
||
|
||
heading_cells[0].text = 'Line No'
|
||
heading_cells[1].text = 'Audited Line No'
|
||
heading_cells[2].text = 'Current Content'
|
||
heading_cells[3].text = 'Script Element'
|
||
heading_cells[4].text = 'New Content'
|
||
heading_cells[5].text = 'Changes Done'
|
||
|
||
|
||
for i in range(0,6):
|
||
heading_cells[i].paragraphs[0].runs[0].font.bold = True
|
||
heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9)
|
||
|
||
|
||
#------------------------------->LOGIC HERE<---------------------------------------------
|
||
report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no'])
|
||
|
||
for index in audit_df.index:
|
||
|
||
columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"]
|
||
audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No')
|
||
if audit_df.loc[index, columns_to_check].eq('No').all().all():
|
||
# All columns contain 'No', skip this row
|
||
continue
|
||
elif audit_df['introduction'][index] == 'Yes':
|
||
continue
|
||
elif audit_df['appendix'][index] == 'Yes':
|
||
continue
|
||
elif audit_df['Identification_Status'][index] == 'blank':
|
||
continue
|
||
elif pd.isna(audit_df.loc[index, "Identification_Status"]):
|
||
continue
|
||
|
||
|
||
|
||
|
||
para_value = audit_df["para_no"][index] # ---------------------------------------------><-------------------------
|
||
current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None
|
||
if para_value == current_para_value:
|
||
continue
|
||
else:
|
||
# report_df = report_df.append(audit_df.loc[index], ignore_index=True)
|
||
new_row = audit_df.loc[index].to_frame().T
|
||
report_df = pd.concat([report_df, new_row], ignore_index=True)
|
||
print("current_para_value",current_para_value)
|
||
|
||
|
||
row_index = 1
|
||
old_line_no_index = index
|
||
collection_old_line_no = []
|
||
while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value):
|
||
if audit_df['Identification_Status'][old_line_no_index] != "blank":
|
||
try:
|
||
data = int(old_line_no_index)
|
||
collection_old_line_no.append(str(data))
|
||
except ValueError:
|
||
pass
|
||
old_line_no_index += 1
|
||
|
||
cells = table.add_row().cells
|
||
cells[0].width = Inches(0.1)
|
||
cells[0].text = ', '.join(collection_old_line_no)
|
||
|
||
audited_line_index = index
|
||
#--------------------------------------audited_lino_no------------------
|
||
collection_audited_line_no = []
|
||
while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value):
|
||
if audit_df['Identification_Status'][audited_line_index] != "blank":
|
||
audited_line_no = audit_df['audited_line_no'][audited_line_index]
|
||
try:
|
||
data = int(audited_line_no)
|
||
collection_audited_line_no.append(str(data))
|
||
except ValueError:
|
||
pass
|
||
audited_line_index += 1
|
||
print("collection_audited_line_no", collection_audited_line_no)
|
||
data_string = ', '.join(collection_audited_line_no)
|
||
print("data_string:", data_string)
|
||
cells[1].width = Inches(0.1)
|
||
cells[1].text = data_string
|
||
|
||
|
||
#------------------------------>OLD DATA<---------------------------------
|
||
data_index = index
|
||
collection_data = []
|
||
while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value):
|
||
cur_data = audit_df['data'][data_index]
|
||
if not pd.isna(cur_data): # Check if the value is not NaN
|
||
data = str(cur_data).strip()
|
||
collection_data.append(data)
|
||
data_index += 1
|
||
|
||
cells[2].width = Inches(3.5)
|
||
data = str(data)
|
||
cells[2].text = '\n '.join(collection_data)
|
||
|
||
if audit_df['Identification_Status'][index] == 'blank':
|
||
script_element = 'Blank Line'
|
||
elif audit_df['Identification_Status'][index] == '':
|
||
if audit_df['introduction'][index] == 'Yes':
|
||
script_element = 'Title/Introduction'
|
||
elif audit_df['appendix'][index] == 'Yes':
|
||
script_element = 'Appendix'
|
||
else:
|
||
continue
|
||
|
||
else:
|
||
script_element = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
data = script_element
|
||
cells[3].width = Inches(0.8)
|
||
cells[3].text = data
|
||
|
||
collection_new_data = []
|
||
new_data_index = index
|
||
|
||
while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value):
|
||
if audit_df["line_removed"][new_data_index] == "No":
|
||
new_data = audit_df['data_corrected'][new_data_index]
|
||
if not pd.isna(new_data): # Check if the value is not NaN
|
||
data = str(new_data).strip()
|
||
collection_new_data.append(data)
|
||
new_data_index += 1
|
||
data = str(new_data).strip()
|
||
cells[4].width = Inches(3.5)
|
||
data = str(data)
|
||
cells[4].text = '\n '.join(collection_new_data)
|
||
|
||
sno = 1
|
||
changes_done = False
|
||
|
||
# identification_status = audit_df['Identification_Status'][index]
|
||
if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "":
|
||
continue
|
||
|
||
|
||
if audit_df['left_indent_corrected'][index] != 'No':
|
||
change_comment = audit_df['left_indent_corrected'][index]
|
||
try:
|
||
str_int = change_comment[-2]+change_comment[-1]
|
||
except Exception as e:
|
||
pass
|
||
if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue":
|
||
if str_int == "15":
|
||
change_comment = "Dialogue line left index corrected to 1.5 Inch"
|
||
elif str_int == "25":
|
||
change_comment = "Dialogue line left index corrected to 2.5 Inch"
|
||
|
||
|
||
if str_int == "15":
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_comment = f"{name} Line left indent corrected to 1.5 Inch"
|
||
print(change_comment)
|
||
elif str_int == "25":
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_commen = f"{name} Left indent corrected to 2.5 Inch"
|
||
|
||
elif str_int == "30":
|
||
change_comment = "Parenthetical left indent corrected to 3 Inch"
|
||
elif str_int == "35":
|
||
change_comment = "Speaker left indent corrected to 3.5 Inch"
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['right_indent_corrected'][index] != 'No':
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_comment = audit_df['right_indent_corrected'][index]
|
||
try:
|
||
str_int = change_comment[-2]+change_comment[-1]
|
||
except Exception as e:
|
||
pass
|
||
if str_int == "10":
|
||
change_comment = f"{name} Line right indent corrected to 1 Inch"
|
||
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['case_corrected'][index] != 'No':
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
string = str(audit_df['case_corrected'][index])
|
||
string = string.split()
|
||
content = string[-1]
|
||
if content == "AllUpper":
|
||
change_comment = f'{name} Case ' + "Corrected to All Upper"
|
||
elif content == "AllLower":
|
||
change_comment = f'{name} Case ' + "Corrected to All Lowerr"
|
||
if len(str(change_comment)) <= 2 :
|
||
continue
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No':
|
||
change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
if name == "Action":
|
||
change_comment = f'{name} Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
elif name == "Dialogue":
|
||
change_comment = f'{name} Line Wrapped at Prescribed Right Indent 2 Inch'
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['line_broken_into_multiple_lines'][index] != 'No':
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_comment = f'{name} line Broken into Multiple Lines'
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['line_merged_with_next_line'][index] != 'No':
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_comment = f'{name} line Merged with Next Line'
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if audit_df['language_specific_audit_comments'][index] != 'No':
|
||
pass
|
||
name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index])
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
|
||
if audit_df['blank_inserted_after'][index] != 'No':
|
||
change_comment = 'A blank line is added below'
|
||
# name = ps_to_script_element(audit_df['Identification_Status'][index])
|
||
# if name == "Action":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch'
|
||
# elif name == "Dialogue":
|
||
# change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch'
|
||
|
||
data = str(sno) + '. ' + str(change_comment)
|
||
# dataa = data.split()
|
||
# if dataa[-1] == "nan":
|
||
# continue
|
||
|
||
cells[5].width = Inches(2)
|
||
para = cells[5].add_paragraph()
|
||
run = para.add_run()
|
||
run.text = data
|
||
run.add_break()
|
||
sno += 1
|
||
changes_done = True
|
||
|
||
if not changes_done:
|
||
continue
|
||
# data = 'No Changes Done'
|
||
# cells[5].width = Inches(2)
|
||
# para = cells[5].add_paragraph()
|
||
# run = para.add_run()
|
||
# run.text = data
|
||
# run.add_break()
|
||
|
||
row_index += 1
|
||
|
||
buffer = io.BytesIO()
|
||
output_doc.save(buffer)
|
||
buffer.seek(0)
|
||
|
||
|
||
#output_doc.save(audit_report_tabular_docx)
|
||
return buffer
|
||
|
||
|
||
def replace_dot_with_comma(slugline):
    """Replace the dots in a slugline's location text with commas.

    Each ``<prefix><location> - <UPPER-CASE TEXT>`` occurrence is rewritten
    so that dots inside ``<location>`` become commas and the hyphen
    separator is normalised to `` - ``.  The optional scene prefix
    (``INT. ``, ``EXT. ``, ``INT./EXT. ``, ``E/I. ``, ``INT `` or ``EXT``)
    is copied through untouched, so its own dots are kept.

    Args:
        slugline: The slugline text to rewrite.

    Returns:
        The rewritten text; input without a matching hyphen separator is
        returned unchanged.
    """
    pattern = (
        # Optional scene prefix, captured so it can be re-emitted verbatim.
        r'((?:INT\./EXT\. |INT\. |EXT\. |E/I\. |INT |EXT)?)'
        # Location text (shortest possible), then the hyphen separator.
        r'\s*(.*?)\s*-\s*'
        # Run of upper-case letters / whitespace that follows the hyphen.
        r'([A-Z\s]+)'
        # Do not let that run end in whitespace when another hyphen-separated
        # segment follows: the next match re-emits its own " - ", so keeping
        # the whitespace here produced a double space in sluglines such as
        # "INT. HOUSE - KITCHEN - DAY".
        r'(?!(?<=\s)\s*-\s*[A-Z\s])'
    )

    def replacer(match):
        prefix, location, tail = match.groups()
        return f'{prefix}{location.replace(".", ",")} - {tail}'

    return re.sub(pattern, replacer, slugline)
|
||
|
||
|
||
def change_dot_to_comma_inslug(df):
    """Run ``replace_dot_with_comma`` over every slugline row of ``df``.

    Rows whose ``script_element`` equals ``'slugline'`` have their
    ``content`` rewritten in place; all other rows are left alone.  The
    slugline text is printed before and after the rewrite.

    Args:
        df: DataFrame with ``script_element`` and ``content`` columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    for row_label, record in df.iterrows():
        # Only sluglines are rewritten; skip everything else early.
        if record['script_element'] != 'slugline':
            continue

        original_slug = record['content']
        print(original_slug)

        rewritten_slug = replace_dot_with_comma(original_slug)
        print(rewritten_slug)

        df.loc[row_label, 'content'] = rewritten_slug

    return df
|
||
|
||
|
||
def fdx_to_audited_df(input_script):
    """Build a paragraph-level script DataFrame from an FDX file.

    The file is parsed by ``utilities.fdx_to_csv``; its ``Text`` and
    ``Script_Element`` columns become ``content`` and ``script_element``.
    Element names are mapped to the internal lower-case labels, blank rows
    are inserted after sluglines, actions, dialogues and transitions
    (except after a dialogue that is directly followed by a parenthetical
    or another dialogue, and never after the last row), sluglines /
    speakers / transitions are upper-cased and parentheticals lower-cased.
    Finally every row, blanks included, receives a running ``para_no`` and
    a ``scene_no``.

    Args:
        input_script: Path of the FDX file to read.

    Returns:
        DataFrame with columns ``para_no``, ``scene_no``, ``content`` and
        ``script_element`` on a fresh 0..n-1 index.
    """
    # Use a context manager so the handle is closed once parsing is done;
    # the previous bare open() leaked it.
    with open(input_script, 'r') as fdx:
        fdx_df = utilities.fdx_to_csv(fdx)

    df = pd.DataFrame(columns=['para_no', 'scene_no', 'content', 'script_element'])
    df['content'] = fdx_df['Text']
    df['script_element'] = fdx_df['Script_Element']

    di = {
        'Scene Heading': 'slugline',
        'Character': 'speaker',
        'Parenthetical': 'parenthetical',
        'Transition': 'transition',
        'Action': 'action',
        'Dialogue': 'dialogue',
    }
    df.replace({"script_element": di}, inplace=True)

    # NOTE(review): the arithmetic below (index + 1, index + 0.5,
    # index < count - 1) assumes fdx_df carries a default 0..n-1 integer
    # index -- confirm against utilities.fdx_to_csv.
    count = len(df)
    # Iterate over a snapshot of the labels: rows are added inside the loop.
    for index in list(df.index):
        se = df.loc[index, 'script_element']

        if index < (count - 1):
            if se in ('slugline', 'action', 'dialogue', 'transition'):
                # Keep a dialogue attached to a following parenthetical or
                # dialogue line: no blank row between them.
                if not (se == 'dialogue'
                        and df.loc[index + 1, 'script_element'] in ('parenthetical', 'dialogue')):
                    # A half-step label makes the new row sort directly
                    # after the current one.
                    blank_label = index + 0.5
                    df.loc[blank_label] = np.nan
                    df.loc[blank_label, 'script_element'] = 'blank'
                    df.loc[blank_label, 'content'] = ''

        # Case normalisation: upper for slugline / speaker / transition,
        # lower for parenthetical.
        if se in ('slugline', 'speaker', 'transition'):
            df.loc[index, 'content'] = str(df.loc[index, 'content']).upper()
        if se == 'parenthetical':
            df.loc[index, 'content'] = str(df.loc[index, 'content']).lower()

    # Move the inserted blank rows into place and renumber from 0.
    df = df.sort_index().reset_index(drop=True)

    para_no = 1
    scene_no = 1
    for index in df.index:
        # Write through df.at instead of the chained df[col][index] = ...,
        # which warns on current pandas and is a silent no-op under
        # Copy-on-Write.
        df.at[index, 'para_no'] = para_no
        df.at[index, 'scene_no'] = scene_no

        # The counter advances after the slugline row itself is numbered,
        # so the new value first applies to the row that follows it.
        if df.at[index, 'script_element'] == 'slugline':
            scene_no += 1

        para_no += 1

    return df
|
||
|
||
|
||
|
||
|
||
|
||
|