def convert_to_pdf(input_docx, out_folder):
    """Convert *input_docx* to PDF with headless LibreOffice.

    Runs ``libreoffice --headless --convert-to pdf --outdir <out_folder>
    <input_docx>`` and waits for the process to finish. The exit status is
    not inspected, so a failed conversion is not reported to the caller.
    """
    subprocess.Popen(
        ['libreoffice', '--headless', '--convert-to', 'pdf',
         '--outdir', out_folder, input_docx]
    ).communicate()


def check_space_line(value):
    """Return ``"Y"`` when *value* is non-empty and all whitespace, else ``"N"``.

    An empty string yields ``"N"`` because ``"".isspace()`` is False.
    """
    return "Y" if value.isspace() else "N"


def check_space(data):
    """Return the number of leading space characters in *data*.

    Only ``" "`` is counted; a tab or any other whitespace ends the run.
    """
    return len(data) - len(data.lstrip(" "))


def get_last_char_pos(data):
    """Return the index of the last non-whitespace character of *data*.

    Returns 0 for a non-empty string that is entirely whitespace and
    ``None`` for the empty string (callers store the value as-is).
    """
    if not data:
        return None
    # One rstrip() replaces the former per-character re-slicing, which was
    # quadratic in the line length.
    return max(len(data.rstrip()) - 1, 0)


def get_case(value):
    """Classify the capitalisation pattern of a line of text.

    Returns one of:
        "None"        empty or whitespace-only input
        "AllUpper"    ``value.isupper()``
        "AllLower"    ``value.islower()``
        "FirstCamel"  first word starts upper-case but is not all upper-case
        "FirstUpper"  first word is all upper-case and longer than one char
        "EndUpper"    last word is all upper-case and longer than one char
        "MidUpper"    some other word is all upper-case and longer than one char
        "Partial"     anything else

    Words are obtained by splitting on single spaces after stripping leading
    whitespace, so runs of spaces produce empty "words".
    """
    # Empty input used to reach an IndexError that a bare ``except`` turned
    # into "None"; test for it explicitly instead.
    if not value or value.isspace():
        return "None"
    if value.isupper():
        return "AllUpper"
    if value.islower():
        return "AllLower"

    words = value.lstrip().split(" ")
    first_word = words[0]
    last_word = words[-1]
    if first_word[0].isupper() and not first_word.isupper():
        return "FirstCamel"
    if first_word.isupper() and len(first_word) > 1:
        return "FirstUpper"
    if last_word.isupper() and len(last_word) > 1:
        return "EndUpper"
    if any(word.isupper() and len(word) > 1 for word in words):
        return "MidUpper"
    return "Partial"
def conv_pdf_to_docx(input_script, output_converted_docx):
    """Convert the PDF *input_script* into the DOCX *output_converted_docx*.

    Thin wrapper around ``parse`` (imported from ``pdf2docx``) called with
    ``start=0, end=None`` - presumably the full page range.
    """
    parse(input_script, output_converted_docx, start=0, end=None)


def conv_docx_to_txt(input_script, output_converted_txt):
    """Render a DOCX file as fixed-width plain text.

    Each paragraph is written to *output_converted_txt* (UTF-8) with leading
    spaces that approximate its position on the page. Inches are multiplied
    by 10 and used as character columns: the section's left margin and the
    paragraph indents become leading spaces, and the usable page width
    becomes the wrap width. Centre/right alignment shifts the indent, and
    paragraph spacing above 5pt adds a blank line before/after.

    Paragraphs that end a section and consist of a single CONT/CONTINUED
    marker are dropped.

    NOTE(review): this function was recovered from source whose line breaks
    had been collapsed; block nesting was reconstructed from statement order.
    Confirm the nesting of the alignment/wrapping section against the
    canonical file.

    Args:
        input_script: path of the DOCX to read.
        output_converted_txt: path of the text file to (over)write.
    """
    read_doc = Document(input_script)

    def recalculate_section_properties(n):
        """Return ``(section, canvas_width, left_margin)`` for section *n*.

        Falls back to ``(None, 65, 15)`` when the section cannot be read.
        """
        try:
            section = read_doc.sections[n]
            section_width_inches = section.page_width.inches
            margins_inches = section.left_margin.inches + section.right_margin.inches
            print(margins_inches)
            canvas_width_inches = section_width_inches - margins_inches
            canvas_width = int(canvas_width_inches * 10)
            print("canvas width", canvas_width)
            left_margin = int(section.left_margin.inches * 10)
        except Exception:
            section = None
            canvas_width = 65
            left_margin = 15
        return section, canvas_width, left_margin

    n = 0
    try:
        section, canvas_width, left_margin = recalculate_section_properties(n)
        # Load-bearing: when the fallback returned section=None this attribute
        # access raises, which triggers the retry with the last section below.
        print(section.left_margin.inches)
    except Exception:
        n = -1
        section, canvas_width, left_margin = recalculate_section_properties(n)

    all_paras = read_doc.paragraphs
    # The former unused ``first = all_paras[0].paragraph_format`` raised
    # IndexError on a document without paragraphs; it has been removed.
    print("number of paras", len(all_paras))
    previous_indent = 0

    with open(output_converted_txt, 'w', encoding='utf-8') as f:
        for para in all_paras:
            print('\n')
            paragraph_format = para.paragraph_format

            # Paragraph spacing decides whether blank lines are added.
            try:
                space_before = paragraph_format.space_before.pt
            except Exception:
                space_before = 0.0
            try:
                space_after = paragraph_format.space_after.pt
            except Exception:
                space_after = 0.0
            print("space before")
            print(space_before)
            print("space after")
            print(space_after)

            try:
                print("line spacing ", paragraph_format.line_spacing.pt)
                print("line spacing rule ", paragraph_format.line_spacing_rule)
                # Skip paragraphs with line spacing under 5pt that follow a
                # deeply indented paragraph.
                if paragraph_format.line_spacing.pt < 5 and previous_indent > 20:
                    continue
            except Exception:
                pass

            section_changed = False
            try:
                # A sectPr inside the paragraph properties marks a section end.
                p = para._p
                sectPrs = p.xpath("./w:pPr/w:sectPr")
                if sectPrs:
                    section_changed = True
                    print("checking for continued at section change")
                    text = para.text.split(' ')
                    print(text)
                    if len(text) == 1:
                        skip_words = ['CONT', 'CONTD', 'CONTINUED', "CONT'D"]
                        found_continue = False
                        for skip_word in skip_words:
                            if skip_word in text[0]:
                                print("found continued")
                                found_continue = True
                                break
                        if found_continue:
                            print("skipping para but setting new section")
                            n = n + 1
                            try:
                                section, canvas_width, left_margin = recalculate_section_properties(n)
                                print(section.left_margin.inches)
                            except Exception as e:
                                print(e)
                            continue
                    print("Continued not found at section change")
            except Exception:
                pass

            if float(space_before) > 5.0:
                print("adding blank line")
                f.write('\n')

            fli = 0
            li = 0
            ri = 0
            try:
                if para.style.name == 'List Paragraph':
                    fli = 0
                else:
                    fli = paragraph_format.first_line_indent.inches
            except Exception:
                pass
            try:
                li = paragraph_format.left_indent.inches
            except Exception:
                pass
            try:
                ri = paragraph_format.right_indent.inches
            except Exception:
                pass

            indent = int((fli + li) * 10)
            print("calculated indent ", indent)
            data = para.text
            lines = data.split('\n')
            print("Examining para")
            try:
                print(para.text)
                print(para.style.name)
            except Exception:
                pass
            print("lines in para", len(lines))

            # Single-line paragraph: clamp a negative indent and drop a
            # trailing one/two-character number sitting past column 40.
            if len(lines) == 1:
                if indent < 0:
                    print(indent)
                    indent = 0
                lines[0] = lines[0].rstrip()
                if len(lines[0]) > 40:
                    if lines[0][40:-2].strip() == '' and re.search(r'\d', lines[0][-2:]):
                        lines[0] = lines[0][0:-2]
            print(indent)

            for line in lines:
                line = line.replace('\t', ' ')
                if indent == 0:
                    # No explicit indent: use the line's own leading spaces.
                    indent = check_space(line)
                line = line.strip()
                if line:
                    print(fli, li, indent, ri)
                    print(para.alignment)
                    try:
                        width = int(canvas_width - (indent + ri * 10))
                    except Exception:
                        width = 58 - indent
                    if str(para.alignment) == 'CENTER (1)':
                        ch_count = len(line)
                        print("line is center aligned")
                        print(ch_count)
                        indent = indent + int((width - ch_count) / 2)
                        print(indent)
                    elif str(para.alignment) == 'RIGHT (2)':
                        ch_count = len(line)
                        print("line is right aligned")
                        print(ch_count)
                        indent = indent + int(width - ch_count)
                        print(indent)
                    else:
                        if str(para.alignment) == 'JUSTIFY (3)':
                            line = ' '.join(line.split())
                        print("line is left aligned")
                        if indent + left_margin > 55:
                            indent = indent - 1
                    if width <= 0:
                        # textwrap.wrap rejects non-positive widths.
                        width = 1
                    print("Calculated Width:", width)
                    wrapped_lines = textwrap.wrap(line, width)
                    wrapped_data_lines_count = len(wrapped_lines)
                    if wrapped_data_lines_count > 1:
                        print("need to wrap line")
                        for wrapped_line in wrapped_lines:
                            wrapped_line = wrapped_line.rjust(len(wrapped_line) + indent + left_margin)
                            print(indent + left_margin)
                            f.write(wrapped_line)
                            f.write('\n')
                        continue
                    line = line.rjust(len(line) + indent + left_margin)
                    f.write(line)
                    f.write('\n')
                else:
                    print("line is blank")
                    f.write(line)
                    f.write('\n')

            # Section properties change only after the closing paragraph of
            # the old section has been written.
            if section_changed:
                n = n + 1
                print("Section changed")
                section, canvas_width, left_margin = recalculate_section_properties(n)
                # The fallback returns section=None; do not let a diagnostic
                # print abort the whole conversion.
                if section is not None:
                    print(section.left_margin.inches)

            if space_after > 5.0:
                print("adding blank line")
                f.write('\n')
            print("\n")
            previous_indent = indent + left_margin
    print("Converted to text")
def conv_pdf_to_txt(input_script, output_converted_txt):
    """Direct PDF-to-text extraction (currently disabled).

    The ``pdftotext`` call this function was built around has been commented
    out, which left the ``pdf`` variable undefined: every call ended in a
    ``NameError``. The function now fails immediately with an explicit
    message instead.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "conv_pdf_to_txt is disabled because the pdftotext backend is not "
        "imported; convert with conv_pdf_to_docx followed by conv_docx_to_txt."
    )


def conv_pdf_to_txt_java(input_script, output_converted_txt):
    """Extract text from a PDF through a py4j Java gateway.

    Calls ``entry_point.strip(input_script)`` on a ``JavaGateway`` and writes
    ``str(result['payload'])`` to *output_converted_txt* as UTF-8.
    NOTE(review): per the original comments ``result`` also carries
    ``'success'`` and ``'error'`` keys; neither is checked here - confirm that
    is intended.
    """
    from py4j.java_gateway import JavaGateway

    gw = JavaGateway()
    result = gw.entry_point.strip(input_script)
    print(result['payload'])
    # Context manager so the handle is flushed and closed (it was leaked before).
    with open(output_converted_txt, "w", encoding="utf8") as out_file:
        out_file.write(str(result['payload']))


def conv_to_txt(input_script, output_converted_docx, output_converted_txt):
    """Convert a script file to plain text based on its extension.

    * ``txt``  - copied as-is.
    * ``pdf``  - converted to DOCX (*output_converted_docx*), then to text.
    * ``docx`` - converted to text.
    * ``fdx``  - converted with ``utilities.fdx_to_txt``.

    The extension is matched case-insensitively.

    Raises:
        ScriptAuditException: for any other extension.
    """
    extention = input_script.rsplit(".", 1)[-1]
    kind = extention.lower()
    if kind == "txt":
        shutil.copyfile(input_script, output_converted_txt)
    elif kind == "pdf":
        conv_pdf_to_docx(input_script, output_converted_docx)
        conv_docx_to_txt(output_converted_docx, output_converted_txt)
    elif kind == "docx":
        conv_docx_to_txt(input_script, output_converted_txt)
    elif kind == "fdx":
        # Close the source handle deterministically (it was leaked before).
        with open(input_script, 'r') as fdx:
            plain_txt = utilities.fdx_to_txt(fdx)
        # UTF-8 like the other converters in this file, instead of the
        # platform default encoding.
        with open(output_converted_txt, 'w', encoding='utf-8') as f:
            f.write(plain_txt)
    else:
        raise ScriptAuditException(f"{extention} file is not supported for Audit!")
def conv_to_df(txt_script):
    """Load a plain-text script into a DataFrame, one row per text line.

    The file is read as UTF-8 and split on ``"\\n"``, so a trailing newline
    produces a final empty row.

    Returns:
        DataFrame with columns ``line_no`` (1-based, stored as the strings
        ``"1.0"``, ``"2.0"``, ...), ``data`` (the line text),
        ``Identification_Status`` (``''``) and ``isIdentified`` (``'No'``).
    """
    # Context manager: the handle used to be left for the GC to close.
    with open(txt_script, 'r', encoding="utf-8") as script_file:
        script_lines = script_file.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']
    rows = []
    for position, data in enumerate(script_lines, start=1):
        line_no = float(position)
        print("processing line", line_no)
        rows.append([str(line_no), data, '', 'No'])
    # Build the frame in one go; appending row by row with df.loc re-allocated
    # the frame for every line.
    return pd.DataFrame(rows, columns=fields)


def conv_to_csv(txt_script, csv_for_processing):
    """Write a plain-text script to CSV, one row per text line.

    Produces the same four columns as ``conv_to_df``: a header row followed
    by ``[line_no, data, '', 'No']`` with ``line_no`` as ``"1.0"``,
    ``"2.0"``, ... The output is written as UTF-8.
    """
    with open(txt_script, 'r', encoding="utf-8") as script_file:
        script_lines = script_file.read().split("\n")

    fields = ['line_no', 'data', 'Identification_Status', 'isIdentified']
    # One handle for header and rows: previously the header was written with
    # the locale encoding and the file was reopened in append mode per row.
    with open(csv_for_processing, 'w', encoding='utf-8', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for position, data in enumerate(script_lines, start=1):
            line_no = float(position)
            print("processing line", line_no)
            csvwriter.writerow([str(line_no), data, '', 'No'])
def pre_assign_wts(df):
    """Split inline "SPEAKER (parenthetical): dialogue" lines into separate rows.

    A line is a candidate when its text contains ``:-``, ``:``, ``-`` or a
    quoted span and does not match any entry of ``skip_words``. The text
    before the separator is the possible speaker (plus optional
    parenthetical), the text after it the possible dialogue. When the speaker
    part is upper-case, digit-free and followed by dialogue:

    * the original row keeps only the speaker and gets weight ``ps7-20``;
    * the parenthetical, if any, becomes a new row (weight ``ps10-20``);
    * the dialogue becomes a new row (weight ``ps13-20;ps14-20;ps15-20``).

    New rows are inserted with fractional index labels (``index + 0.3`` /
    ``+ 0.6``) so that sorting the index puts them right after their source
    row; the frame is then re-indexed and ``line_no`` renumbered 1.0, 2.0, ...

    Expects numeric ``line_no`` and existing ``isIdentified`` and
    ``preassigned_weights`` columns. The caller's frame is modified in place
    up to the final sort; the renumbered frame is returned.

    NOTE(review): ``skip_words`` are joined into a regex unescaped, so the
    dot in ``INT.`` / ``EXT.`` matches any character - confirm whether that
    is intended.
    """
    skip_words = ['INT.', 'EXT.', 'I/E', 'E/I', 'CUT TO', 'CUT BACK TO', 'FLASHCUT TO', 'DISSOLVE TO',
                  'INTERCUT', 'INTER CUT', 'PBS', 'INTERVAL', 'FLASHBACK', 'FADE IN', 'FADE TO BLACK',
                  'ON THE SCREEN', 'ON THE TV', 'MORNING', 'AT HOTEL', 'TV', 'MONTAGES',
                  'MUSICAL MONTAGES', 'ESSENTIALS', 'LATER', 'ESSENTIAL']

    stripped = df['data'].str.strip()
    # "== True/False" also maps NaN results (non-string cells) to False.
    has_separator = stripped.str.contains(r':-|:|-|".*"') == True
    has_no_skip_word = stripped.str.contains('|'.join(skip_words)) == False
    pos_sp_dial_line_nos = df.loc[has_separator & has_no_skip_word, 'line_no'].to_list()
    print(pos_sp_dial_line_nos)

    # Candidate rows are fixed before any row is inserted.
    candidate_index = df.loc[df['line_no'].isin(pos_sp_dial_line_nos), :].index
    for index in candidate_index:
        data = df['data'][index]
        line_no = df['line_no'][index]
        pos_sp = ''
        pos_par = ''
        pos_dia = ''
        pos_sp_par = ''
        try:
            print(data)
        except Exception:
            pass

        # First matching separator wins; dialogue is whatever follows the
        # last occurrence of that separator.
        if ":-" in data:
            pos_sp_par = data.split(":-")[0]
            pos_dia = data.split(":-")[-1].strip()
        elif ":" in data:
            pos_sp_par = data.split(":")[0]
            pos_dia = data.split(":")[-1].strip()
        elif "-" in data:
            pos_sp_par = data.split("-")[0]
            pos_dia = data.split("-")[-1].strip()
        elif "\"" in data:
            pos_sp_par = data.split("\"")[0]
            pos_dia = data.split("\"")[-2].strip()

        pos_sp_par = pos_sp_par.strip()
        if pos_sp_par:
            if "(" in pos_sp_par and ")" in pos_sp_par:
                pos_sp = pos_sp_par.split("(")[0]
                pos_par = "(" + pos_sp_par.split("(")[-1]
            else:
                pos_sp = pos_sp_par
                pos_par = ''
        print(pos_sp)
        print(pos_par)
        print(pos_dia)

        if not pos_sp:
            continue
        has_digit = any(char.isdigit() for char in pos_sp)
        if has_digit or not pos_sp.isupper() or not pos_dia.strip():
            continue

        # .loc writes replace the chained ``df[col][index] = ...`` form, which
        # does not update the frame under pandas Copy-on-Write.
        df.loc[index, 'data'] = pos_sp
        df.loc[index, 'preassigned_weights'] = 'ps7-20'
        if pos_par:
            par_label = index + 0.3
            df.loc[par_label] = np.nan
            df.loc[par_label, 'data'] = pos_par
            df.loc[par_label, 'line_no'] = line_no + 0.3
            df.loc[par_label, 'isIdentified'] = 'No'
            df.loc[par_label, 'preassigned_weights'] = 'ps10-20'
            print("split pos_par", df.loc[par_label, 'line_no'])
        if pos_dia:
            dia_label = index + 0.6
            df.loc[dia_label] = np.nan
            df.loc[dia_label, 'data'] = pos_dia
            df.loc[dia_label, 'line_no'] = line_no + 0.6
            df.loc[dia_label, 'isIdentified'] = 'No'
            df.loc[dia_label, 'preassigned_weights'] = 'ps13-20;ps14-20;ps15-20'
            print("split pos_dia", df.loc[dia_label, 'line_no'], type(df.loc[dia_label, 'line_no']))

    # Fractional labels sort directly behind their source row.
    df = df.sort_index().reset_index(drop=True)
    for index in df.index:
        df.at[index, 'line_no'] = float(index + 1)
    return df
def create_audit_df(df):
    """Build the audit-trail frame for a script frame.

    Copies ``line_no`` and ``data`` from *df*, adds the bookkeeping columns
    (free-text columns initialised to ``''``, yes/no flags initialised to
    ``'No'``) and indexes the result by ``line_no``.

    Returns:
        A new DataFrame indexed by ``line_no``; *df* is not modified.
    """
    text_columns = (
        'Identification_Status',
        'data_corrected',
        'audited_line_no',
        'scene_number',
    )
    flag_columns = (
        'line_removed',
        'introduction',
        'appendix',
        'page_no',
        'left_indent_corrected',
        'right_indent_corrected',
        'line_wrapped_at_prescribed_right_indent',
        'case_corrected',
        'blank_inserted_before',
        'blank_inserted_after',
        'blank_deleted_before',
        'blank_deleted_after',
        'space_removed_between_characters',
        'space_added_between_characters',
        'line_merged_with_next_line',
        'line_broken_into_multiple_lines',
        'punctuation_mark_added',
        'punctuation_mark_removed',
        'language_specific_audit_comments',
    )

    # Explicit copy: adding columns to a bare selection of ``df`` triggers
    # pandas' SettingWithCopy machinery.
    audit_df = df[['line_no', 'data']].copy()
    for column in text_columns:
        audit_df[column] = ''
    for column in flag_columns:
        audit_df[column] = 'No'
    audit_df.set_index('line_no', inplace=True)
    return audit_df
def trim_intro(df, audit_df):
    """Drop the title/introduction rows that precede the first "FADE IN".

    Scans *df* for the first row whose whitespace-normalised text contains
    ``FADE IN`` (case-insensitive). If that row's index label is at most 100,
    that row and every row before it are dropped from *df* in place and
    flagged ``line_removed`` / ``introduction`` = ``'Yes'`` in *audit_df*
    (looked up by ``line_no``). A first match beyond label 100 removes
    nothing.

    Assumes *df* has consecutive integer index labels starting at 0 and that
    every ``line_no`` is present in ``audit_df.index``.
    """
    stopwords = ['FADE IN']
    for index in df.index:
        data = ' '.join(df['data'][index].split())
        if not any(re.search(sw, data, re.IGNORECASE) for sw in stopwords):
            continue
        print("Found Fade In", index)
        if index > 100:
            # Too far into the script to be an introduction; keep scanning.
            continue
        print("removing lines till ", index)
        for row in range(index, -1, -1):
            line_no = df['line_no'][row]
            # .at instead of chained assignment, which is a no-op under
            # pandas Copy-on-Write.
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'introduction'] = 'Yes'
            df.drop(row, inplace=True)
        print("title and introduction removed")
        break


def remove_page_numbers(df, audit_df):
    """Flag far-right numeric lines as page numbers in *audit_df*.

    A row counts as a page number when it has more than 54 leading spaces and
    its stripped text consists only of digits and dots. Matching rows get
    ``line_removed`` and ``page_no`` set to ``'Yes'``; *df* is not changed.

    NOTE(review): nesting was reconstructed from collapsed source; this
    version evaluates the page-number test per candidate row only.
    """
    for index in df.index:
        data = df['data'][index]
        if check_space(data) <= 54:
            continue
        pos_page_no = data.strip()
        if pos_page_no and re.fullmatch(r'[\d.]+', pos_page_no):
            line_no = df['line_no'][index]
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'page_no'] = 'Yes'


def get_per_uppercase(text):
    """Return the percentage (truncated int) of upper-case characters in *text*.

    The percentage is taken over the stripped text, inner spaces included.
    Empty or whitespace-only text yields 0.
    """
    stripped = text.strip()
    if not stripped:
        # Formerly a bare ``except`` swallowing the ZeroDivisionError.
        return 0
    count_upper = sum(1 for ch in stripped if ch.isupper())
    return int(count_upper / len(stripped) * 100)


def prep_for_audit(df):
    """Annotate every row of *df* with the layout features used by the audit.

    The index is reset in place, NaN ``data`` becomes ``''`` and any missing
    feature column is created. Columns written per row:

    * ``Identification_Status`` - ``'blank'`` for empty/whitespace rows or
      rows with at least 140 leading spaces.
    * ``plb`` / ``nlb`` - ``"Y"``/``"N"``: previous / next row is blank.
    * ``ssc`` - leading-space count; ``lcp`` - index of last non-space char.
    * ``case`` - result of ``get_case``; ``per_uppercase`` - result of
      ``get_per_uppercase``; ``parenthetical`` - where brackets sit.
    * ``pnbl_line_no`` / ``nnbl_line_no`` - ``line_no`` of the row one step
      back/forward, or two steps when the adjacent row is blank.
    * ``ppnbl_line_no`` / ``nnnbl_line_no`` - the same lookup applied to that
      neighbour.
    * ``pdil_line_no`` / ``ndil_line_no`` - ``line_no`` of the nearest
      previous / next non-blank row whose ``ssc`` differs.

    ``0`` marks "nothing before" and ``100000`` marks "nothing after".

    Returns:
        The same (mutated) DataFrame.
    """
    def _is_blank(text):
        return check_space(text) >= 140 or text.isspace() or (not text)

    def _classify_parenthetical(text):
        # Plain string tests replace the former non-raw '\(' / '\)' regexes.
        stripped = text.strip()
        has_open = '(' in stripped
        has_close = ')' in stripped
        if stripped.startswith('('):
            if stripped.endswith(')'):
                return 'Complete'
            if has_close:
                return 'PartStartMid'
            return 'StartingLeft'
        if stripped.endswith(')'):
            return 'PartMidEnd' if has_open else 'EndingRight'
        if has_open and has_close:
            return 'PartMidMid'
        if has_open:
            return 'MixedLeft'
        if has_close:
            return 'MixedRight'
        return 'Absent'

    df.reset_index(inplace=True, drop=True)
    print("Entering prep_for_audit")
    # Reassign: ``df['data'].fillna(..., inplace=True)`` acts on a temporary
    # under pandas Copy-on-Write.
    df['data'] = df['data'].fillna('')

    for column in ('scene_number', 'Identification_Status', 'plb', 'nlb', 'ssc'):
        if column not in df.columns:
            df[column] = ''
    if 'lcp' not in df.columns:
        df['lcp'] = 0
    for column in ('case', 'per_uppercase', 'parenthetical',
                   'pnbl_line_no', 'nnbl_line_no', 'ppnbl_line_no',
                   'nnnbl_line_no', 'pdil_line_no', 'ndil_line_no'):
        if column not in df.columns:
            df[column] = ''
    print("prep_for_audit- after if")
    print(df)

    last_index = df.index[-1] if len(df.index) else -1

    # Pass 1: per-row features and the nearest neighbours.
    for index in df.index:
        data = df['data'][index]
        if _is_blank(data):
            df.at[index, 'Identification_Status'] = 'blank'

        first_line = index == 0
        last_line = index == last_index
        if first_line:
            plb = "N"
        else:
            plb = "Y" if _is_blank(df['data'][index - 1]) else "N"
        if last_line:
            nlb = "N"
        else:
            nlb = "Y" if _is_blank(df['data'][index + 1]) else "N"

        df.at[index, 'plb'] = plb
        df.at[index, 'nlb'] = nlb
        df.at[index, 'ssc'] = check_space(data)
        df.at[index, 'lcp'] = get_last_char_pos(data)
        df.at[index, 'case'] = get_case(data)
        df.at[index, 'parenthetical'] = _classify_parenthetical(data)
        df.at[index, 'per_uppercase'] = get_per_uppercase(data)

        if first_line:
            pnbl_line_no = 0
        elif plb == 'N':
            pnbl_line_no = df['line_no'][index - 1]
        elif index - 1 == 0:
            pnbl_line_no = 0
        else:
            pnbl_line_no = df['line_no'][index - 2]

        if last_line:
            nnbl_line_no = 100000
        elif nlb == 'N':
            nnbl_line_no = df['line_no'][index + 1]
        elif index + 1 == last_index:
            nnbl_line_no = 100000
        else:
            # index + 2 always exists here (index + 1 is not the last row),
            # so the former try/except around this lookup was unreachable.
            nnbl_line_no = df['line_no'][index + 2]

        df.at[index, 'pnbl_line_no'] = pnbl_line_no
        df.at[index, 'nnbl_line_no'] = nnbl_line_no
    print("prep_for_audit- after 1st for loop")

    # Pass 2: neighbours of the neighbours.
    for index in df.index:
        line_no = df['line_no'][index]
        pnbl_line_no = df['pnbl_line_no'][index]
        if pnbl_line_no == 0:
            ppnbl_line_no = 0
        else:
            ppnbl_line_no = df.loc[df['line_no'] == pnbl_line_no, 'pnbl_line_no'].values[0]
        nnbl_line_no = df['nnbl_line_no'][index]
        print(index, line_no, pnbl_line_no, nnbl_line_no)
        if nnbl_line_no == 100000:
            nnnbl_line_no = 100000
        else:
            nnnbl_line_no = df.loc[df['line_no'] == nnbl_line_no, 'nnbl_line_no'].values[0]
        df.at[index, 'ppnbl_line_no'] = ppnbl_line_no
        df.at[index, 'nnnbl_line_no'] = nnnbl_line_no
    print("prep_for_audit- after 2nd for loop")

    # Pass 3: nearest previous non-blank row with a different indent.
    for index in df.index:
        pdil_line_no = 0
        cur_indent = df['ssc'][index]
        print(index, "looking for previous different indent line")
        if index == 0:
            df.at[index, 'pdil_line_no'] = pdil_line_no
            continue
        pdil_index = index - 1
        while pdil_index >= 0:
            pdil_indent = df['ssc'][pdil_index]
            print(cur_indent, pdil_indent)
            if df['Identification_Status'][pdil_index] != 'blank' and pdil_indent != cur_indent:
                pdil_line_no = df['line_no'][pdil_index]
                break
            pdil_index -= 1
        df.at[index, 'pdil_line_no'] = pdil_line_no
    print("prep_for_audit- after 3rd for loop")

    # Pass 4: nearest following non-blank row with a different indent.
    for index in df.index:
        ndil_line_no = 100000
        cur_indent = df['ssc'][index]
        print("looking for next different indent line")
        if index == last_index:
            df.at[index, 'ndil_line_no'] = ndil_line_no
            continue
        ndil_index = index + 1
        while ndil_index <= last_index:
            ndil_indent = df['ssc'][ndil_index]
            print(cur_indent, ndil_indent)
            if df['Identification_Status'][ndil_index] != 'blank' and ndil_indent != cur_indent:
                ndil_line_no = df['line_no'][ndil_index]
                break
            ndil_index += 1
        df.at[index, 'ndil_line_no'] = ndil_line_no
    return df
def remove_extra_blank_lines(df, audit_df):
    """Flag runs of consecutive blank lines so that only one is kept.

    For every row except the last: when the row and the following row are
    both blank, the row is flagged in *audit_df* (``line_removed`` =
    ``'Yes'``, ``Identification_Status`` = ``'blank'``). When the row is
    blank but the next one is not, ``df['plb']`` of the blank row is set to
    ``'N'``.

    Assumes *df* has consecutive integer index labels starting at 0 and that
    every ``line_no`` is present in ``audit_df.index``. Does nothing for an
    empty frame.
    """
    if df.empty:
        # df.index[-1] would raise IndexError.
        return
    for index in range(0, df.index[-1]):
        data = df['data'][index]
        line_no = df['line_no'][index]
        nl_data = df['data'][index + 1]
        try:
            print(data)
        except Exception:
            pass
        if data.strip():
            continue
        if not nl_data.strip():
            # .at instead of chained assignment (ineffective under pandas
            # Copy-on-Write).
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'Identification_Status'] = 'blank'
        else:
            df.at[index, 'plb'] = 'N'


def remove_blank_line_after_parenthetical(df, audit_df):
    """Flag the blank line that directly follows a closing parenthetical.

    For every row except the last: when the row's ``parenthetical`` value is
    ``'Complete'`` or ``'EndingRight'`` and the next row is blank, the next
    row is flagged ``line_removed`` = ``'Yes'`` in *audit_df* and the row's
    ``nlb`` in *df* is set to ``'N'``.

    Assumes *df* has consecutive integer index labels starting at 0 and that
    every ``line_no`` is present in ``audit_df.index``. Does nothing for an
    empty frame.
    """
    if df.empty:
        return
    for index in range(0, df.index[-1]):
        data = df['data'][index]
        nl_data = df['data'][index + 1]
        nl_line_no = df['line_no'][index + 1]
        try:
            print(data)
        except Exception:
            pass
        if df['parenthetical'][index] in ('Complete', 'EndingRight') and not nl_data.strip():
            audit_df.at[nl_line_no, 'line_removed'] = 'Yes'
            df.at[index, 'nlb'] = 'N'
def merge_broken_lines(df, audit_df):
    """Merge a line with the next non-blank line when it looks like a wrap.

    For every row except the last, the next non-blank row is looked up via
    ``nnbl_line_no``. The two are merge candidates when that row's indent
    (``ssc``) is 1 to 3 columns past the current row's last character
    (``lcp``) and neither row is ``'AllUpper'``. A merged line shorter than
    150 characters replaces the current row's ``data``/``case``; *audit_df*
    records ``line_merged_with_next_line`` for the current line and
    ``line_removed`` for the absorbed one, and the absorbed row (plus the
    blank between them, if any) is skipped.

    NOTE(review): ``cur_case`` is hard-coded to ``'AllUpper'`` below, so the
    merge condition can never be true and this function currently only
    prints diagnostics. That is preserved as found - confirm whether merging
    was disabled on purpose or ``df['case'][index]`` was intended.

    Assumes *df* has consecutive integer index labels starting at 0. Does
    nothing for an empty frame.
    """
    if df.empty:
        # df.index[-1] would raise IndexError.
        return
    index_iter = iter(range(0, df.index[-1]))
    for index in index_iter:
        cur_line_data = df['data'][index]
        cur_line_indent = df['ssc'][index]
        cur_case = 'AllUpper'
        cur_lcp = df['lcp'][index]
        nnbl_line_no = df['nnbl_line_no'][index]
        nlb = df['nlb'][index]

        try:
            next_rows = df['line_no'] == nnbl_line_no
            next_nbl_data = df.loc[next_rows, 'data'].values[0]
            next_nbl_indent = df.loc[next_rows, 'ssc'].values[0]
            next_nbl_case = df.loc[next_rows, 'case'].values[0]
        except Exception:
            # No such row (e.g. the 100000 sentinel): neutral values that
            # cannot satisfy the merge condition. next_nbl_case used to be
            # left unbound here.
            next_nbl_data = ''
            next_nbl_indent = 0
            next_nbl_case = ''

        line_no = df['line_no'][index]
        indent_dif = next_nbl_indent - cur_lcp
        print(line_no, indent_dif)

        is_wrap_candidate = (
            0 < indent_dif <= 3
            and next_nbl_case != 'AllUpper'
            and cur_case != 'AllUpper'
        )
        if not is_wrap_candidate:
            print(index, cur_line_indent, next_nbl_indent)
            try:
                print(cur_line_data)
            except Exception:
                pass
            continue

        # A gap of exactly one column means the word itself was split.
        joiner = '' if indent_dif == 1 else ' '
        two_line_data = cur_line_data.rstrip() + joiner + next_nbl_data.lstrip()
        two_line_len = len(two_line_data.strip())
        print(index, line_no, cur_line_indent, next_nbl_indent, two_line_len)
        print(cur_line_data)
        print(next_nbl_data)
        if two_line_len >= 150:
            print(cur_line_data)
            continue

        print("merging lines")
        # .at instead of chained assignment (ineffective under pandas
        # Copy-on-Write).
        df.at[index, 'data'] = two_line_data
        df.at[index, 'case'] = get_case(two_line_data)
        print(line_no)
        audit_df.at[line_no, 'line_merged_with_next_line'] = 'Yes'
        print(two_line_data)
        audit_df.at[nnbl_line_no, 'line_removed'] = 'Yes'

        # Skip the absorbed row, and the blank row in between when present.
        # The default keeps an exhausted iterator from raising StopIteration.
        next(index_iter, None)
        if nlb != 'N':
            next(index_iter, None)
def remove_space_between_words(df, audit_df):
    """Collapse runs of whitespace between words while keeping the indent.

    Rows already flagged ``line_removed`` in *audit_df*, and rows whose
    ``ssc`` exceeds 140, are skipped. Every other row's ``data`` is rebuilt
    as ``ssc`` leading spaces followed by its words, each followed by a
    single space (so rebuilt text ends with one trailing space), and ``lcp``
    is recomputed. When the stripped text changed,
    ``space_removed_between_characters`` is set to ``'Yes'`` in *audit_df*.
    """
    # Set for O(1) membership instead of scanning a list for every row.
    lines_removed = set(audit_df.loc[audit_df['line_removed'] == 'Yes'].index)
    for index in df.index:
        cur_indent = df['ssc'][index]
        line_no = df['line_no'][index]
        if (line_no in lines_removed) or cur_indent > 140:
            continue
        data = df['data'][index]
        new_data = ''.join(word + " " for word in data.lstrip().split())
        new_data = new_data.rjust(len(new_data) + cur_indent)
        # .at instead of chained assignment (ineffective under pandas
        # Copy-on-Write).
        df.at[index, 'data'] = new_data
        df.at[index, 'lcp'] = get_last_char_pos(new_data)
        if new_data.strip() != data.strip():
            audit_df.at[line_no, 'space_removed_between_characters'] = 'Yes'
            print(index)
            try:
                print(data)
                print(new_data)
            except Exception:
                pass


def get_strict_conditions(csv_strict_conditions):
    """Load the strict-identification rule matrix from a CSV file.

    The first CSV line is skipped, the second is the header and the first
    column becomes the index. Only the first 30 data rows are kept. The
    needed columns are picked by position (counted after the index column)
    and renamed to the ``cl_*``, ``pnbl_*``, ``nnbl_*``, ``pdil_*`` and
    ``ndil_*`` names used by the checks.

    Returns:
        DataFrame with those 23 columns, in the fixed order listed below.
    """
    # Position (after the index column) -> canonical column name.
    position_to_name = {
        3: 'cl_plb',
        4: 'cl_nlb',
        5: 'cl_ssc',
        6: 'cl_lcp',
        7: 'cl_par',
        8: 'cl_case',
        9: 'cl_per_uppercase',
        10: 'pnbl_plb',
        11: 'pnbl_par',
        12: 'pnbl_vs_cur_indent',
        15: 'pnbl_case',
        16: 'nnbl_nlb',
        17: 'nnbl_par',
        18: 'nnbl_vs_cur_indent',
        21: 'nnbl_case',
        22: 'pdil_plb',
        23: 'pdil_nlb',
        24: 'pdil_vs_cur_indent',
        27: 'pdil_par',
        29: 'ndil_plb',
        30: 'ndil_nlb',
        31: 'ndil_vs_cur_indent',
        34: 'ndil_par',
    }
    output_order = [
        'cl_plb', 'cl_nlb', 'cl_ssc', 'cl_lcp', 'cl_par', 'cl_case', 'cl_per_uppercase',
        'pnbl_plb', 'pnbl_par', 'pnbl_vs_cur_indent', 'pnbl_case',
        'nnbl_nlb', 'nnbl_par', 'nnbl_vs_cur_indent', 'nnbl_case',
        'pdil_plb', 'pdil_nlb', 'pdil_par', 'pdil_vs_cur_indent',
        'ndil_plb', 'ndil_nlb', 'ndil_par', 'ndil_vs_cur_indent',
    ]

    conditions_df = pd.read_csv(csv_strict_conditions, index_col=[0], skiprows=[0]).head(30)
    cols = conditions_df.columns
    renames = {cols[position]: name for position, name in position_to_name.items()}
    # Non-inplace rename: avoids mutating the object returned by head().
    return conditions_df.rename(columns=renames)[output_order]
conditions_df[['cl_plb','cl_nlb','cl_ssc','cl_lcp','cl_par','cl_case','cl_per_uppercase', 'pnbl_plb','pnbl_par','pnbl_vs_cur_indent','pnbl_case', 'nnbl_nlb','nnbl_par','nnbl_vs_cur_indent','nnbl_case', 'pdil_plb','pdil_nlb','pdil_par','pdil_vs_cur_indent', 'ndil_plb','ndil_nlb','ndil_par','ndil_vs_cur_indent']] return conditions_df def test_strict_conditions(df,csv_strict_conditions): import pandas as pd left_aligned = True for index in df.index: if df['ssc'][index] > 15: left_aligned = False break ## if conversion to text is left aligned then dialogue middle wrongly getting identified as action middle so skipping strict contions if left_aligned: df.loc[df['Identification_Status'] == 'blank','isIdentified'] = 'Yes' return conditions_df = get_strict_conditions(csv_strict_conditions) #df['isIdentified'] = 'No' df['When_Identified'] = '' bb = False for index in df.index: if df['isIdentified'][index] == 'Yes': continue cl_plb = df['plb'][index] cl_nlb = df['nlb'][index] cl_indent = pd.to_numeric(df['ssc'][index]) cl_lcp = df['lcp'][index] cl_par = df['parenthetical'][index] cl_case = str(df['case'][index]) cl_per_uppercase = df['per_uppercase'][index] pnbl_plb = None pnbl_indent = None pnbl_par = None pnbl_case = None nnbl_nlb = None nnbl_indent = None nnbl_par = None nnbl_case = None pnbl = True nnbl = True pdil = True ndil = True pdil_plb = None pdil_nlb = None pdil_par = None pdil_indent = None ndil_plb = None ndil_nlb = None ndil_par = None ndil_indent = None pnbl_vs_cur_indent = "NA" nnbl_vs_cur_indent = "NA" pdil_vs_cur_indent = "NA" ndil_vs_cur_indent = "NA" try: pnbl_line_no = df['pnbl_line_no'][index] pnbl_index = df.loc[df['line_no'] == pnbl_line_no, :].index.values[0] except: pnbl = False if pnbl: pnbl_plb = df['plb'][pnbl_index] pnbl_indent = df['ssc'][pnbl_index] pnbl_par = df['parenthetical'][pnbl_index] pnbl_case = str(df['case'][pnbl_index]) if pnbl_indent > cl_indent: pnbl_vs_cur_indent = "More" elif pnbl_indent == cl_indent: pnbl_vs_cur_indent = 
"Same" else: pnbl_vs_cur_indent = "Less" #print(pnbl_index) try: nnbl_line_no = df['nnbl_line_no'][index] nnbl_index = df.loc[df['line_no'] == nnbl_line_no, :].index.values[0] nnbl_nlb = df['nlb'][nnbl_index] nnbl_indent = df['ssc'][nnbl_index] nnbl_par = df['parenthetical'][nnbl_index] nnbl_case = str(df['case'][nnbl_index]) if nnbl_indent > cl_indent: nnbl_vs_cur_indent = "More" elif nnbl_indent == cl_indent: nnbl_vs_cur_indent = "Same" else: nnbl_vs_cur_indent = "Less" except: nnbl = 'afterlast' try: pdil_line_no = df['pdil_line_no'][index] pdil_index = df.loc[df['line_no'] == pdil_line_no, :].index.values[0] except: pdil = False if pdil: pdil_plb = df['plb'][pdil_index] pdil_nlb = df['nlb'][pdil_index] pdil_par = df['parenthetical'][pdil_index] pdil_indent = df['ssc'][pdil_index] if pdil_indent > cl_indent: pdil_vs_cur_indent = "More" elif pdil_indent == cl_indent: pdil_vs_cur_indent = "Same" else: pdil_vs_cur_indent = "Less" try: ndil_line_no = df['ndil_line_no'][index] ndil_index = df.loc[df['line_no'] == ndil_line_no, :].index.values[0] except: ndil = False if ndil: ndil_plb = df['plb'][ndil_index] ndil_nlb = df['nlb'][ndil_index] ndil_par = df['parenthetical'][ndil_index] ndil_indent = df['ssc'][ndil_index] if ndil_indent > cl_indent: ndil_vs_cur_indent = "More" elif ndil_indent == cl_indent: ndil_vs_cur_indent = "Same" else: ndil_vs_cur_indent = "Less" cl_pos = '' ## get the conditions #for j in range(1,32): for j in range(1,18): if j in [23,24,32,33]: continue ev_cl_plb = conditions_df['cl_plb' ]["ps{0}".format(j)] ev_cl_nlb = conditions_df['cl_nlb' ]["ps{0}".format(j)] ev_cl_indent_range = conditions_df['cl_ssc' ]["ps{0}".format(j)].split('-') try: ev_cl_indent_from = pd.to_numeric(ev_cl_indent_range[0]) except: ev_cl_indent_from = 200 try: ev_cl_indent_to = pd.to_numeric(ev_cl_indent_range[1]) except: ev_cl_indent_to = ev_cl_indent_from ev_cl_lcp_range = conditions_df['cl_lcp' ]["ps{0}".format(j)].split('-') try: ev_cl_lcp_from = 
pd.to_numeric(ev_cl_lcp_range[0]) except: ev_cl_lcp_from = 200 try: ev_cl_lcp_to = pd.to_numeric(ev_cl_lcp_range[1]) except: ev_cl_lcp_to = ev_cl_lcp_from ev_cl_par = conditions_df['cl_par' ]["ps{0}".format(j)].split(";") ev_cl_case = [] ev_cl_case = conditions_df['cl_case' ]["ps{0}".format(j)].split(";") ev_cl_per_uppercase = conditions_df['cl_per_uppercase']["ps{0}".format(j)].split(";") try: operator = ev_cl_per_uppercase[0] value = int(ev_cl_per_uppercase[1]) except: operator = "" value = "" ev_pnbl_plb = conditions_df['pnbl_plb' ]["ps{0}".format(j)] ev_pnbl_vs_cur_indent = conditions_df['pnbl_vs_cur_indent' ]["ps{0}".format(j)].split(";") ev_pnbl_par = conditions_df['pnbl_par' ]["ps{0}".format(j)].split(";") ev_pnbl_case = conditions_df['pnbl_case' ]["ps{0}".format(j)].split(";") ev_nnbl_nlb = conditions_df['nnbl_nlb' ]["ps{0}".format(j)] ev_nnbl_vs_cur_indent = conditions_df['nnbl_vs_cur_indent' ]["ps{0}".format(j)].split(";") ev_nnbl_par = conditions_df['nnbl_par' ]["ps{0}".format(j)].split(";") ev_nnbl_case = conditions_df['nnbl_case' ]["ps{0}".format(j)].split(";") ev_pdil_plb = conditions_df['pdil_plb' ]["ps{0}".format(j)] ev_pdil_nlb = conditions_df['pdil_nlb' ]["ps{0}".format(j)] ev_pdil_vs_cur_indent = conditions_df['pdil_vs_cur_indent' ]["ps{0}".format(j)].split(";") ev_pdil_par = conditions_df['pdil_par' ]["ps{0}".format(j)].split(";") ev_ndil_plb = conditions_df['ndil_plb' ]["ps{0}".format(j)] ev_ndil_nlb = conditions_df['ndil_nlb' ]["ps{0}".format(j)] ev_ndil_vs_cur_indent = conditions_df['ndil_vs_cur_indent' ]["ps{0}".format(j)].split(";") ev_ndil_par = conditions_df['ndil_par' ]["ps{0}".format(j)].split(";") ## checks if not pnbl: pnbl_plb_check = True pnbl_indent_check = True pnbl_par_check = True pnbl_case_check = True else: pnbl_plb_check = True if (pnbl_plb == ev_pnbl_plb) or (ev_pnbl_plb == 'Maybe') else False pnbl_indent_check = True if (pnbl_vs_cur_indent in ev_pnbl_vs_cur_indent ) else False pnbl_par_check = True if pnbl_par in 
ev_pnbl_par else False if pnbl_case in ev_pnbl_case or ev_pnbl_case == '' : pnbl_case_check = True else: pnbl_case_check = False if not nnbl : nnbl_nlb_check = True nnbl_indent_check = True nnbl_par_check = True nnbl_case_check = True else: nnbl_nlb_check = True if (nnbl_nlb == ev_nnbl_nlb) or (ev_nnbl_nlb == 'Maybe') else False nnbl_indent_check = True if (nnbl_vs_cur_indent in ev_nnbl_vs_cur_indent ) else False nnbl_par_check = True if nnbl_par in ev_nnbl_par else False if nnbl_case in ev_nnbl_case or ev_nnbl_case == '' : nnbl_case_check = True else: nnbl_case_check = False if not pdil: pdil_plb_check = True pdil_nlb_check = True pdil_indent_check = True pdil_par_check = True else: pdil_plb_check = True if (pdil_plb == ev_pdil_plb) or (ev_pdil_plb == 'Maybe') else False pdil_nlb_check = True if (pdil_nlb == ev_pdil_nlb) or (ev_pdil_nlb == 'Maybe') else False pdil_indent_check = True if (pdil_vs_cur_indent in ev_pdil_vs_cur_indent ) else False pdil_par_check = True if pdil_par in ev_pdil_par else False if not ndil: ndil_plb_check = True ndil_nlb_check = True ndil_indent_check = True ndil_par_check = True else: ndil_plb_check = True if (ndil_plb == ev_ndil_plb) or (ev_ndil_plb == 'Maybe') else False ndil_nlb_check = True if (ndil_nlb == ev_ndil_nlb) or (ev_ndil_nlb == 'Maybe') else False ndil_indent_check = True if (ndil_vs_cur_indent in ev_ndil_vs_cur_indent ) else False ndil_par_check = True if ndil_par in ev_ndil_par else False cl_indent_check = False cl_lcp_check = False ## check conditions cl_plb_check = True if (cl_plb == ev_cl_plb) or (ev_cl_plb == 'Maybe') else False cl_nlb_check = True if (cl_nlb == ev_cl_nlb) or (ev_cl_nlb == 'Maybe') else False cl_indent_check = True if (cl_indent >= ev_cl_indent_from) and (cl_indent <= ev_cl_indent_to) else False cl_lcp_check = True if (cl_lcp >= ev_cl_lcp_from) and (cl_lcp <= ev_cl_lcp_to) else False cl_par_check = True if cl_par in ev_cl_par else False if j == 21 and cl_case in ev_cl_case: data = df['data'][index] if 
def _interpolate_weight_columns(wts_df, prefix, anchors):
    """Fill in per-position weight columns between the given anchor columns.

    For every integer position strictly between ``anchors[0]`` and
    ``anchors[-1]`` that is not itself an anchor, a ``<prefix>_<pos>`` column
    is created and linearly interpolated (then rounded) from the neighbouring
    anchor columns. The interpolated columns are merged back onto ``wts_df``
    on ``Possibilities``; anchor columns that already exist in ``wts_df``
    come back duplicated with a ``_y`` suffix, as before.
    """
    anchor_cols = ['{}_{}'.format(prefix, pos) for pos in anchors]
    every_col = ['{}_{}'.format(prefix, pos)
                 for pos in range(anchors[0], anchors[-1] + 1)]
    sub = wts_df.loc[:, ['Possibilities'] + anchor_cols].set_index('Possibilities')
    # reindex keeps every anchor untouched and adds the missing positions as
    # NaN, already in numeric order, so no anchor can be overwritten.
    sub = sub.reindex(columns=every_col)
    sub = sub.interpolate(axis=1).round().reset_index()
    return wts_df.merge(sub, how='inner', on=['Possibilities'], suffixes=('', '_y'))


def prep_weights_csv (weights_csv) :
    """Load the weights matrix CSV and expand it to one column per position.

    The first row of the file is skipped, at most 50 data rows are kept, and
    a fixed set of columns is renamed by position. Columns named ``ssc_*`` and
    ``lcp_*`` are expected to be present in the header under those names.

    Weights for starting-space counts 16..64 are interpolated between the
    ``ssc_15/25/30/35/55/65`` anchors, and weights for last-character
    positions 31..74 between ``lcp_30/35/49/59/72/75`` (``lcp_30`` and
    ``lcp_75`` are fixed at 1).

    Parameters
    ----------
    weights_csv : path or file-like accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame indexed by ``Possibilities``.
    """
    wts_df = pd.read_csv(weights_csv, skiprows=[0])
    wts_df = wts_df.head(50)

    # Columns are identified by position in the sheet, not by header text.
    positional_names = {
        1: 'Possibilities', 2: 'Description', 3: 'PureImpure',
        7: 'AllUpper', 8: 'AllLower', 9: 'FirstCamel', 10: 'FirstUpper',
        11: 'FirstLowerSomeUpper', 12: 'Partial',
        13: 'EntireLine', 14: 'PartofLine',
        15: 'only left parenthetical present',
        16: 'only right parenthetical present',
        17: 'PLB_Yes', 18: 'PLB_No', 19: 'NLB_Yes', 20: 'NLB_No',
        21: '<15withNumeric',
        29: '<15withoutNumeric',
        34: 'cur_indent_equals_pnbl', 35: 'cur_indent_equals_nnbl',
        36: 'containsSpecialWords1', 37: 'containsSpecialWords2',
        38: 'containsSpecialWords3', 39: 'containsSpecialWords4',
    }
    wts_df = wts_df.rename(
        columns={wts_df.columns[pos]: name for pos, name in positional_names.items()}
    )

    wts_df = wts_df.loc[:, [
        'Possibilities', 'Description', 'PureImpure',
        'AllUpper', 'AllLower', 'FirstCamel', 'FirstUpper',
        'FirstLowerSomeUpper', 'Partial', 'EntireLine', 'PartofLine',
        'only left parenthetical present', 'only right parenthetical present',
        'PLB_Yes', 'PLB_No', 'NLB_Yes', 'NLB_No', '<15withNumeric',
        'ssc_15', 'ssc_25', 'ssc_30', 'ssc_35', 'ssc_55', 'ssc_65', 'ssc_gt_65',
        '<15withoutNumeric', 'lcp_35', 'lcp_49', 'lcp_59', 'lcp_72',
        'cur_indent_equals_pnbl', 'cur_indent_equals_nnbl',
        'containsSpecialWords1', 'containsSpecialWords2',
        'containsSpecialWords3', 'containsSpecialWords4',
    ]]

    ## interpolate the in between weights for the starting space count
    # Previously ssc_55 was blanked before interpolating, so positions 36..64
    # ignored that anchor; every anchor is now honoured.
    wts_df = _interpolate_weight_columns(wts_df, 'ssc', [15, 25, 30, 35, 55, 65])

    ## interpolate the in between weights for the last character placement
    wts_df['lcp_30'] = 1
    wts_df['lcp_75'] = 1
    wts_df = _interpolate_weight_columns(wts_df, 'lcp', [30, 35, 49, 59, 72, 75])

    wts_df = wts_df.set_index('Possibilities')
    return wts_df
def give_largest(df, n):
    """Return the ``n`` largest values of a Series, labelled by rank.

    ``df`` is a pandas Series (one row or column of a frame). The result is a
    Series indexed ``'1_largest'``, ``'2_largest'``, ... in decreasing order;
    it is shorter than ``n`` when ``nlargest`` yields fewer values.
    """
    largest = df.nlargest(n)
    labels = [f'{rank}_largest' for rank in range(1, len(largest) + 1)]
    return pd.Series(list(largest), index=labels)


def n_largest(df, axis, n):
    '''
    Function to return the n-largest value of each column/row of the input DataFrame.
    '''
    return df.apply(give_largest, axis=axis, n=n)


def _add_neighbor_weight(df, index, offset, columns, amount):
    """Add ``amount`` to ``columns`` of the row labelled ``index + offset``.

    Does nothing when that row (or a column) does not exist, so the first and
    last lines of the script are skipped instead of growing the frame.
    """
    try:
        target = index + offset
    except TypeError:
        # Non-numeric index labels have no "neighbouring" row.
        return
    if target not in df.index:
        return
    for col in columns:
        if col in df.columns:
            # .loc writes into df itself; chained df[col][row] may hit a copy.
            df.loc[target, col] += amount


def update_parenthetical_neighbor_wt(df):
    """Boost possibility weights on the rows adjacent to parenthetical rows.

    For a row whose ``parenthetical`` is ``'StartingLeft'`` or ``'Complete'``,
    ``ps7`` of the row one above (two above when ``plb`` is not ``'N'``) gains
    10. For ``'EndingRight'`` or ``'Complete'``, ``ps13`` and ``ps15`` of the
    row one below (two below when ``nlb`` is not ``'N'``) gain 15.

    The frame is modified in place and also returned.
    """
    print("updating weghts of parenthetical neighbors")
    for index in df.index:
        par = df['parenthetical'][index]
        if par == 'Absent':
            continue
        ## line before complete of StartingLeft or Complete
        if par in ('StartingLeft', 'Complete'):
            print(index, par)
            offset = -1 if df['plb'][index] == 'N' else -2
            _add_neighbor_weight(df, index, offset, ('ps7',), 10)
        ## line after Complete or EndingRight
        if par in ('EndingRight', 'Complete'):
            print(index, par)
            offset = 1 if df['nlb'][index] == 'N' else 2
            _add_neighbor_weight(df, index, offset, ('ps13', 'ps15'), 15)
    return df
def gen_pos_weights(df,weights_csv):
    """Accumulate a weight for every possibility column ``ps1``..``ps31`` per line.

    The weights matrix is loaded with ``prep_weights_csv(weights_csv)`` and,
    for each row of ``df``, the entries matching that row's case, starting
    space count (``ssc``), last character position (``lcp``), ``plb``/``nlb``
    flags, parenthesis layout, indent relative to the previous/next non-blank
    line and special-word matches are summed into the ``ps<i>`` columns.
    Preassigned weights (``'psN-W;psM-W'`` strings) are added at the end of
    each row, then ``update_parenthetical_neighbor_wt`` is applied.

    ``df`` is modified in place and returned; an empty ``actual_element``
    column is added when missing.

    NOTE(review): the original indentation was lost in this copy of the file;
    the nesting below is reconstructed from syntax and should be confirmed.
    """
    # prep weights csv
    wts_df = prep_weights_csv(weights_csv)
    import os
    import csv
    import re
    import sys
    from pathlib import Path
    import argparse
    import numpy as np
    import pandas as pd
    import math

    # The script counts as left aligned unless some line starts past column 16.
    left_aligned = True
    contains_special = False
    for index in df.index:
        if df['ssc'][index] > 16:
            left_aligned = False
            break
    print("is script left aligned: ",left_aligned)

    # Create the weight columns, initialised to 0.
    for i in range(1,32):
        # NOTE(review): i is an int, so this test against strings is never
        # true and ps23/ps24 are not skipped -- confirm the intent.
        if i in ('23','24','32','33'):
            continue
        df["ps{0}".format(i)] = 0

    for index in df.index:
        line_no = df['line_no'][index]
        data = df['data'][index]
        plb = df['plb'][index]
        nlb = df['nlb'][index]
        contains_special = False
        print("processing weight for ",line_no)

        # Locate the previous / next non-blank line through the line numbers
        # stored on the row; False when the lookup fails.
        pnbl_index = False
        nnbl_index = False
        try:
            pnbl_line_no = df['pnbl_line_no'][index]
            pnbl_index = df.loc[df['line_no'] == pnbl_line_no,:].index.values[0]
        except:
            pnbl_index = False
        try:
            nnbl_line_no = df['nnbl_line_no'][index]
            nnbl_index = df.loc[df['line_no'] == nnbl_line_no,:].index.values[0]
        except:
            nnbl_index = False
        # NOTE(review): when the index is False this is a lookup with key
        # False, not necessarily a failure -- confirm -1 is reached as intended.
        try:
            pnbl_indent = df['ssc'][pnbl_index]
        except:
            pnbl_indent = -1
        try:
            nnbl_indent = df['ssc'][nnbl_index]
        except:
            nnbl_indent = -1

        cur_indent = df['ssc'][index]
        ssc_col = 'ssc_' + str(cur_indent)
        print(ssc_col)
        case = df['case'][index]
        try:
            print("processing line no",line_no, data)
        except:
            pass
        print(plb)
        print(nlb)
        print(pnbl_indent)
        print(nnbl_indent)

        ### weights to be assigned based on space count, case, parenthesis and plb/nlb
        # Name of the weights column for this row's last character position.
        lcp = df['lcp'][index]
        try:
            lcp_col = "lcp_" + str(int(lcp))
        except:
            lcp_col = "lcp_" + str(lcp)
        # make space dict for getting relevant space columns for weights
        # (not referenced again in this function)
        sp_bin_dict = {1:'0-14',2:'15',3:'16-24',4:'25',5:'26-29',6:'30',7:'31-34',8:'35',9:'36-73',10:'74onwards' }

        # loop over the possibilities
        for i in range(1,32):
            # NOTE(review): never true, see the note on the same test above.
            if i in ('23','24','32','33'):
                continue
            df["ps{0}".format(i)][index] = 0
            ## get weights for the case
            # EndUpper / MidUpper share the 'FirstLowerSomeUpper' weights column.
            if case in ('EndUpper','MidUpper'):
                case = 'FirstLowerSomeUpper'
            if case != 'None':
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),case]
            ## get weights based on the starting space count
            # Skipped silently when the matrix has no column for this indent.
            try:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),ssc_col]
            except:
                pass
            print("ps{0}".format(i),df["ps{0}".format(i)][index])
            ## get weights for <15 with Numeric character or <15 without Numeric character
            try:
                start_num = True if re.search('[0-9]',data.strip()[0]) else False
            except:
                start_num = False
            pos_num = re.search('[0-9]',data)
            if (pos_num!= None) and start_num and cur_indent<15:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withNumeric']
            elif check_space(data)<15:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withoutNumeric']
            if cur_indent>65:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'ssc_gt_65']
            ## get weights based on the last character placement
            print("ps{0}".format(i),df["ps{0}".format(i)][index])
            try:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),lcp_col]
                print("code was here")
                print(wts_df.loc["ps{0}".format(i),lcp_col])
            except Exception as e:
                print ("lcp exception is",e)
                pass
            # Calculation of weights based on plb and nlb (L-O column in sheet)
            if plb == "Y":
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_Yes']
            if plb == "N":
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_No']
            if nlb == "Y":
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_Yes']
            if nlb == "N":
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_No']
            # Calculation of weights based on parenthesis (H-K column in sheet):
            # whole line wrapped, both present, only "(", or only ")".
            if re.match('\(',data.strip()[:1]) and re.match('\)',data.strip()[-1:]) :
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'EntireLine']
            elif re.search('\(',data.strip()) and re.search('\)',data.strip()) :
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PartofLine']
            elif re.search('\(',data.strip()) and not(re.search('\)',data.strip())) :
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only left parenthetical present']
            elif not(re.search('\(',data.strip())) and re.search('\)',data.strip()) :
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only right parenthetical present']
            ## Calculation of weights based on indent equals previous / next non blank line
            if cur_indent == pnbl_indent:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_pnbl']
            if cur_indent == nnbl_indent:
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_nnbl']
            print("ps{0}".format(i),df["ps{0}".format(i)][index])

        print("Special Words Check")
        ## calculation of weights based on special words
        # Each word is used as a regex matched (case-insensitively) at the
        # start of the line after colons are removed.
        sp_words1 = ['cut to','CUT BACK TO','FLASHCUT TO','dissolve to', 'intercut', 'Inter Cut','PBS', 'interval', 'Flashback','FADE IN','FADE TO BLACK']
        for sp_word in sp_words1:
            print(sp_word)
            search_data = data.replace(":","")
            match = re.match(sp_word,search_data.strip(),re.IGNORECASE)
            if match:
                contains_special = True
                break
        print (contains_special,search_data)
        if not contains_special:
            search_data = data.strip()
            ## check if within quotes
            # NOTE(review): the non-ASCII literals below look like mis-encoded
            # curly quotes; confirm against the file's real encoding.
            if search_data:
                if len(search_data) > 3:
                    if (search_data.startswith('ā€œ') or search_data.startswith('"')):
                        if (search_data.endswith('ā€') or search_data.endswith('"')):
                            contains_special = True
        if contains_special:
            try:
                print("found match in ",data)
            except:
                print("found match ")
            for i in range(1,32):
                if i in ('23','24','32','33'):
                    continue
                df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords1']

        ## calculation of weights based on special slug words
        # Searched as regexes within the first 8 characters of the stripped
        # line; '.' in these patterns therefore matches any character.
        sp_words3 = ['INT.','EXT.','I/E','E/I','EXT-','INT-']
        if not contains_special:
            for sp_word in sp_words3:
                print(sp_word)
                found = re.search(sp_word,data.strip()[0:8])
                if found:
                    contains_special = True
                    try:
                        print("found match in ",data)
                    except:
                        print("found match ")
                    for i in range(1,32):
                        if i in ('23','24','32','33'):
                            continue
                        df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords3']
                    break

        ## calculation of weights based on special slug endings
        # NOTE(review): reconstructed as running for every row, regardless of
        # contains_special -- confirm against the original indentation.
        sp_words4 = [' - MORNING',' - DAY',' - EVENING',' - EVE',' - NIGHT',' - LATER',' - AFTERNOON']
        for sp_word in sp_words4:
            found = re.search(sp_word,data.strip())
            if found:
                contains_special = True
                try:
                    print("found match in ",data)
                except:
                    print("found match ")
                for i in range(1,32):
                    if i in ('23','24','32','33'):
                        continue
                    df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'containsSpecialWords4']
                break

        # speaker possible if at most two all-caps words, no '.' or ':' and
        # the line ends before column 30 (left-aligned scripts only)
        if left_aligned and not contains_special:
            if case == 'AllUpper' and len(data.split()) <= 2 and "." not in data and ":" not in data and df['lcp'][index] < 30 :
                print("boosting speaker possibility")
                df["ps7"][index] += 30

        ## add preassigned weight
        # Entries look like 'ps7-30;ps8-10': column name, '-', integer weight.
        if not contains_special:
            if df['preassigned_weights'][index]:
                pre_psw_list = df['preassigned_weights'][index].split(';')
                for psw in pre_psw_list:
                    ps = psw.split('-')[0]
                    wt = psw.split('-')[1]
                    df[ps][index] += int(wt)

    df = update_parenthetical_neighbor_wt(df)
    if 'actual_element' not in df.columns:
        df['actual_element'] = ''
    return df
def sort_pos_decr_wts(df):
    """Rank the possibility weights of every line in decreasing order.

    For each row the 28 possibility columns are sorted by weight, largest
    first, and 28 label columns (``first_largest`` ... ``teight_largest``)
    are produced, each holding ``'<ps column>-<weight>'`` for that rank.

    The ranking is computed on the row values directly, so it is correct for
    any index (the previous implementation re-joined the ranked names through
    a fresh 0..n-1 index) and the caller's frame is no longer modified.

    Parameters
    ----------
    df : pandas.DataFrame containing the possibility columns listed below.

    Returns
    -------
    pandas.DataFrame with a fixed column layout: line attributes, the 28 rank
    labels, the 28 possibility columns, then the neighbour line-number
    columns. Columns missing from ``df`` are filled with NaN.
    """
    ps_cols = (['ps{}'.format(i) for i in range(1, 20)]
               + ['ps21', 'ps22']
               + ['ps{}'.format(i) for i in range(25, 32)])
    # Spellings are part of the output schema; do not "correct" them.
    rank_labels = [
        'first_largest', 'second_largest', 'third_largest', 'fourth_largest',
        'fifth_largest', 'sixth_largest', 'seventh_largest', 'eight_largest',
        'ninth_largest', 'tenth_largest', 'eleventh_largest', 'twelth_largest',
        'thirteenth_largest', 'fourteenth_largest', 'fifteenth_largest',
        'sixteenth_largest', 'seventeenth_largest', 'eighteenth_largest',
        'ninteenth_largest', 'tewenty_largest', 'tone_largest', 'ttwo_largest',
        'tthree_largest', 'tfour_largest', 'tfive_largest', 'tsix_largest',
        'tseven_largest', 'teight_largest',
    ]
    lead_cols = [
        'line_no', 'data', 'actual_element', 'Identification_Status',
        'scene_number', 'plb', 'nlb', 'ssc', 'lcp', 'case', 'parenthetical',
        'isIdentified', 'When_Identified',
    ]
    tail_cols = [
        'pnbl_line_no', 'nnbl_line_no', 'ppnbl_line_no', 'nnnbl_line_no',
        'pdil_line_no', 'ndil_line_no',
    ]

    weights = df[ps_cols].to_numpy()
    # Column order per row, largest weight first.
    order = np.argsort(weights, axis=1)[:, ::-1]
    ranked_names = np.array(ps_cols, dtype=object)[order]
    ranked_weights = np.take_along_axis(weights, order, axis=1)

    ranked = {}
    for rank, label in enumerate(rank_labels):
        names = pd.Series(ranked_names[:, rank], index=df.index)
        values = pd.Series(ranked_weights[:, rank], index=df.index).astype(str)
        ranked[label] = names + "-" + values
    ranked_df = pd.DataFrame(ranked, index=df.index, columns=rank_labels)

    # Drop stale label columns first so the concat never duplicates a name.
    base = df.drop(columns=rank_labels, errors='ignore')
    res = pd.concat([base, ranked_df], axis=1)
    return res.reindex(columns=lead_cols + rank_labels + ps_cols + tail_cols)
def prep_for_pos_elimination(df):
    """Prepare candidate-possibility lists on ``df`` before elimination.

    Works in place on ``df`` (nothing is returned):

    1. For every row with ``isIdentified == 'No'``, writes all candidate
       possibilities sorted by decreasing weight into
       ``Identification_Status_with_weights`` (``'psN-W;...'``) and
       ``Identification_Status`` (``'psN;...'``).
    2. Builds ``ps_not_to_remove``: the top-weighted (tied) possibilities plus
       ones protected by parentheses or an upper-case first word.
    3. Drops parenthetical possibilities from rows whose ``parenthetical``
       value rules them out.
    4. Restricts the first and last (non-``'None'`` case) lines to fixed sets.
    5. Re-sorts every multi-candidate row by weight.

    NOTE(review): the original indentation was lost in this copy of the file;
    the nesting below is reconstructed from syntax and should be confirmed.
    """
    all_pos = [ "ps{0}".format(ps) for ps in range(1,35) ]
    # A new column is inserted at a fixed position (12) for the weighted list.
    df.insert(12,'Identification_Status_with_weights','')
    for x in ['ps23','ps24','ps32','ps33','ps34']:
        all_pos.remove(x)

    ## go through all lines
    ## if line is not identified add all possibilities
    ## get weights of the possibilities
    ## sort the possibilities in decreasing order of weights
    def useWeights(ps):
        # Sort key: the integer after the first '-' in 'psN-W'.
        return int(ps.split("-")[1])

    for index in df.index:
        if df["isIdentified"][index] == 'No':
            cur_line_pos = all_pos
            ## append the weight to the possibilities
            pos_with_weights = []
            for pos in cur_line_pos:
                wt = ''
                pos_wt = str(pos)
                # Possibilities without a usable weight column are skipped.
                try:
                    wt = df[pos][index].astype(int)
                    pos_wt += '-' + str(wt)
                except:
                    continue
                pos_with_weights.append(pos_wt)
            # now sort in descending order using the weights as key
            pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
            line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
            df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
            ## copy over to identification status without the weights but in order of decreasing weights
            pos_without_weight = []
            for pos in pos_with_weights:
                pos_without_weight.append(pos.split("-")[0])
            line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
            print(line_pos_string)
            df['Identification_Status'][index] = line_pos_string

    ## make a column which indicates the possibilities not to be removed
    df['ps_not_to_remove'] = 'ps34'
    for index in df.index:
        data = df['data'][index]
        pos_not_to_remove = []
        if df["isIdentified"][index] == 'No':
            ## find the top possibilities - max weight
            pos_with_wts = df["Identification_Status_with_weights"][index].split(";")
            max_pos_index = 0
            wt1 = 0
            wt2 = 0
            print(df["line_no"][index])
            # Walk the sorted list while consecutive weights are equal, so
            # every possibility tied for the top weight is kept.
            for k in range(0,len(pos_with_wts)-1):
                wt1 = pos_with_wts[k].split("-")[1]
                wt2 = pos_with_wts[k+1].split("-")[1]
                print(wt1,wt2)
                if wt2 == wt1:
                    max_pos_index = k+1
                    continue
                else:
                    break
            print (max_pos_index)
            for j in range(0,max_pos_index+1):
                pos_not_to_remove.append(df["Identification_Status"][index].split(";")[j])
            # if entire line in parenthetical don't remove ps8,ps10
            if re.match('\(',data.strip()[:1]) and re.match('\)',data.strip()[-1:]):
                pos_not_to_remove.append('ps8')
                pos_not_to_remove.append('ps10')
            # NOTE(review): words[0] raises IndexError if data is blank here.
            words = data.split()
            if len(words[0]) > 1 and words[0].isupper():
                pos_not_to_remove.append('ps8')
                pos_not_to_remove.append('ps25')
                pos_not_to_remove.append('ps30')
                if len(words) == 1:
                    pos_not_to_remove.append('ps7')
            # "(" appears before ")" somewhere after the start of the line.
            left_p = True if re.search('\(',data) else False
            right_p = True if re.search('\)',data) else False
            if left_p and right_p and not(re.match('\(',data.strip()[:1])):
                if (re.search('\)',data).start() -re.search('\(',data).start()) > 0:
                    pos_not_to_remove.append('ps25')
                    pos_not_to_remove.append('ps27')
            pos_not_to_remove = list(set(pos_not_to_remove))
            df['ps_not_to_remove'][index] = (";").join(str(elem) for elem in pos_not_to_remove)

    ## go through all the lines, if parenthetical is absent remove ps 10,11,12,25,26,27
    pos_to_remove = ['ps10','ps11','ps12','ps25','ps26','ps27']
    for index in df.index:
        if df['parenthetical'][index] == 'Absent':
            cur_line_pos = df["Identification_Status"][index].split(";")
            pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
            cur_line_pos = [pos for pos in cur_line_pos if pos not in pos_to_remove]
            pos_not_to_remove = [pos for pos in pos_not_to_remove if pos not in pos_to_remove]
            df["Identification_Status"][index] = ";".join(cur_line_pos)
            df['ps_not_to_remove'][index] = ";".join(pos_not_to_remove)

    # Partial parentheticals cannot be ps11 / ps12.
    pos_to_remove = ['ps11','ps12']
    for index in df.index:
        if df['parenthetical'][index] in ('PartMidEnd','PartStartMid','PartMidMid'):
            cur_line_pos = df["Identification_Status"][index].split(";")
            pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
            cur_line_pos = [pos for pos in cur_line_pos if pos not in pos_to_remove]
            pos_not_to_remove = [pos for pos in pos_not_to_remove if pos not in pos_to_remove]
            df["Identification_Status"][index] = ";".join(cur_line_pos)
            df['ps_not_to_remove'][index] = ";".join(pos_not_to_remove)

    ## refine the possibilities of first and last line
    # A boundary row whose case is 'None' is stepped over by one row only.
    # NOTE(review): assumes the frame is labelled 0..n-1 -- confirm.
    first_line_index = 0
    if df['case'][first_line_index] == 'None':
        first_line_index += 1
    last_line_index = df.index[-1]
    if df['case'][last_line_index] == 'None':
        last_line_index -= 1
    ## keep possibilities of first line
    eligible_pos = ['ps1','ps2','ps17','ps18']
    first_line_pos = df["Identification_Status"][first_line_index].split(";")
    first_line_pos = [ps for ps in first_line_pos if ps in eligible_pos ]
    df['Identification_Status'][first_line_index] = ";".join(first_line_pos)
    df['ps_not_to_remove'][first_line_index] = ""
    ## keep possibilities of last line
    # Here the result follows the order of eligible_pos, not the row's order.
    eligible_pos = ['ps6','ps15','ps16','ps17','ps29','ps30','ps31']
    last_line_pos = df["Identification_Status"][last_line_index].split(";")
    last_line_pos = [ps for ps in eligible_pos if ps in last_line_pos ]
    df['Identification_Status'][last_line_index] = ";".join(last_line_pos)
    df['ps_not_to_remove'][last_line_index] = ""

    # Re-sort the surviving candidates of every multi-candidate row by weight.
    for index in df.index:
        cur_line_pos = df["Identification_Status"][index].split(";")
        if len(cur_line_pos) == 1:
            continue
        ## append the weight to the possibilities
        pos_with_weights = []
        for pos in cur_line_pos:
            wt = ''
            pos_wt = str(pos)
            try:
                wt = df[pos][index].astype(int)
                pos_wt += '-' + str(wt)
            except:
                continue
            pos_with_weights.append(pos_wt)
        # now sort in descending order using the weights as key
        pos_with_weights = sorted(pos_with_weights,key=useWeights , reverse = True)
        line_pos_string_with_weights = ';'.join([str(elem) for elem in pos_with_weights])
        df['Identification_Status_with_weights'][index] = line_pos_string_with_weights
        ## copy over to identification status without the weights but in order of decreasing weights
        pos_without_weight = []
        for pos in pos_with_weights:
            pos_without_weight.append(pos.split("-")[0])
        line_pos_string = ';'.join([str(elem) for elem in pos_without_weight])
        print(line_pos_string)
        df['Identification_Status'][index] = line_pos_string
def examine_speaker_pos(df,audit_df):
    """Find further speaker lines from the known speaker names and normalise them.

    NOTE(review): this function was recovered from a whitespace-collapsed copy
    of the file. Statements are unchanged, but the nesting of a few branches
    (marked below) had to be inferred -- confirm against version control.

    What the code does, in order:
      1. Collects the stripped text of every row tagged 'ps7' as the list of
         known speaker names, and the 'line_no' of every 'ps7'/'ps8' row.
      2. Scans all other rows for text equal to a known name, either on one
         line or spread over two consecutive lines.
      3. Repairs two-line speakers on a copy of ``df`` indexed by 'line_no'
         (splitting off a trailing parenthetical, or merging the two lines).
      4. Tags the newly found speaker lines 'ps7', upper-cases them and removes
         inner gaps when the first word is a single character.
      5. For rows that merely contain a speaker name, splits or narrows the
         ';'-separated possibilities held in 'Identification_Status'.
      6. Recomputes 'isIdentified' for every row.

    Parameters:
        df: frame of script lines. Columns read here: 'Identification_Status',
            'data', 'line_no', 'isIdentified', 'parenthetical',
            'When_Identified', 'ssc', 'case', 'ps_not_to_remove', 'nlb', 'plb'.
        audit_df: audit frame, written in place in columns 'line_removed',
            'case_corrected', 'space_removed_between_characters' and
            'line_broken_into_multiple_lines'.
            NOTE(review): presumably indexed by line number -- it is addressed
            with line numbers here; confirm against the caller.

    Returns:
        A new frame (built from a copy of ``df``), possibly with rows removed
        or inserted.

    NOTE(review): every write uses chained indexing (``frame[col][key] = v``).
    pandas documents this as unreliable and as a no-op under Copy-on-Write;
    ``.loc``/``.at`` would be the safe form.
    """
    print("examining speaker possibilties")
    speaker_list = df.loc[df['Identification_Status'] == 'ps7','data'].astype(str)
    speaker_list = [ elem.strip() for elem in speaker_list ]
    speaker_lines_list = df.loc[(df['Identification_Status'] == 'ps7') | (df['Identification_Status'] == 'ps8'),'line_no'].to_list()
    unique_speaker_list = []
    speaker_in_two_lines_list = []
    # De-duplicate the speaker names while keeping first-seen order.
    for speaker in speaker_list:
        speaker = speaker.strip()
        if speaker not in unique_speaker_list:
            unique_speaker_list.append(speaker)
    # Printing is wrapped in try/except throughout this function, so a print
    # that raises never aborts the run.
    try:
        print(unique_speaker_list)
    except:
        pass
    print(speaker_lines_list)

    # --- Pass 1: look for known speaker names in lines not yet tagged as speaker.
    for index in df.index:
        line_no = df['line_no'][index]
        data = df['data'][index].strip()
        if df['Identification_Status'][index] == 'ps7':
            continue
        if df['Identification_Status'][index] == 'ps8':
            continue
        try:
            # Match either with all whitespace removed or as-is, upper-cased.
            if ("".join(data.split()).upper() in unique_speaker_list) or (data.upper() in unique_speaker_list):
                print (line_no,data)
                if line_no not in speaker_lines_list and df['isIdentified'][index] != 'Yes':
                    speaker_lines_list.append(line_no)
        except:
            print(line_no,data,"data is not str")
            pass
        # Speaker name broken over this line and the next one.
        # NOTE(review): `index+1` assumes consecutive integer index labels.
        if index != df.index[-1]:
            nl_data = df['data'][index+1]
            cur_par = df['parenthetical'][index]
            if data.strip() and nl_data.strip() and cur_par == 'Absent':
                two_line_data = "".join((data+nl_data).split())
                if two_line_data in unique_speaker_list:
                    print(line_no,data)
                    print(line_no,nl_data)
                    speaker_in_two_lines_list.append(line_no)
                # Joined text up to the first '(' is a speaker, but this line alone is not.
                elif two_line_data.lstrip().split("(")[0] in unique_speaker_list and data.lstrip().split("(")[0].strip() not in unique_speaker_list :
                    print(line_no,data)
                    print(line_no,nl_data)
                    speaker_in_two_lines_list.append(line_no)
    speaker_lines_list.sort()
    speaker_in_two_lines_list.sort()
    print(speaker_lines_list)
    print(speaker_in_two_lines_list)

    # Work on a copy keyed by line number so rows can be addressed by line_no.
    df_line_index =df.copy().set_index('line_no')
    df_line_index.head()

    # --- Pass 2: repair speakers that were split over two lines.
    for sp_line in speaker_in_two_lines_list:
        if df_line_index['Identification_Status'][sp_line] == 'ps8':
            continue
        data = df_line_index['data'][sp_line]
        line_no = sp_line
        nl_data = df_line_index['data'][sp_line+1]
        new_data = data.strip() + nl_data.strip()
        if re.search('\(',new_data):
            # Joined text holds a parenthetical: keep the name on the first line
            # and move everything from '(' onwards to the second line.
            par_pos = re.search('\(',new_data).start()
            before_par = new_data[:par_pos]
            after_par = new_data[par_pos:]
            df_line_index['data'][sp_line] = before_par
            df_line_index['data'][sp_line+1] = after_par
            if re.match('\)',after_par.strip()[-1]):
                # Last character is ')': the parenthetical is complete.
                df_line_index['Identification_Status'][sp_line+1] = 'ps10'
                df_line_index['parenthetical'][sp_line+1] = 'Complete'
                df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
            # NOTE(review): after_par starts at the '(' found above, so a match
            # for ')' at the start of the string cannot succeed; this branch
            # looks unreachable (re.search may have been intended).
            elif re.match('\)',after_par.strip()):
                df_line_index['Identification_Status'][sp_line+1] = 'ps26'
                df_line_index['parenthetical'][sp_line+1] = 'PartStartMid'
                df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
            else:
                df_line_index['Identification_Status'][sp_line+1] = 'ps11'
                df_line_index['parenthetical'][sp_line+1] = 'StartingLeft'
                df_line_index['When_Identified'][sp_line+1] = 'ExaminingSpeakerLines'
            # The first line is now a plain speaker line.
            speaker_lines_list.append(sp_line)
        else:
            # NOTE(review): `audit_sno` is not defined inside this function; unless
            # it exists at module level this print raises NameError.
            print("%s.line no: %s , Found Speaker in two lines merged the line \n" %(audit_sno,sp_line))
            # No parenthetical: merge both lines into the first, drop the second.
            df_line_index['data'][sp_line] = new_data
            speaker_lines_list.append(sp_line)
            df_line_index.drop((sp_line+1),inplace= True)
            audit_df['line_removed'][sp_line+1] = 'Yes'
            print (data,nl_data,new_data)

    # --- Pass 3: tag and normalise the speaker lines found above.
    for sp_line in speaker_lines_list:
        line_no = sp_line
        # Rows already tagged as speaker are left untouched.
        if df_line_index['Identification_Status'][sp_line] in ('ps7','ps8'):
            continue
        df_line_index['Identification_Status'][sp_line] = 'ps7'
        df_line_index['parenthetical'][sp_line] = 'Absent'
        if df_line_index['When_Identified'][sp_line] != 'FirstStrictConditions' :
            df_line_index['When_Identified'][sp_line] = 'ExaminingSpeakerLines'
        sp_data = df_line_index['data'][sp_line].strip()
        print(sp_line)
        try:
            print(sp_data)
        except:
            pass
        # Re-indent with the row's own 'ssc' value; fall back to 35 when it is
        # not convertible to int.
        cur_indent = df_line_index['ssc'][sp_line]
        try:
            new_speaker_indent = int(cur_indent)
        except:
            new_speaker_indent = 35
        # Speaker names are forced to upper case.
        if df_line_index['case'][sp_line] != 'AllUpper':
            try:
                print(sp_data)
            except:
                pass
            df_line_index['data'][sp_line] = sp_data.rjust(len(sp_data)+new_speaker_indent).upper()
            df_line_index['case'][sp_line] = 'AllUpper'
            try:
                print("case corrected to:",df_line_index['data'][sp_line])
            except:
                pass
            audit_df['case_corrected'][line_no] = 'Speaker Case corrected to All Upper Case'
        # Remove inner whitespace, but only when the first word is one character.
        sp_data_nogap = "".join(sp_data.split())
        if sp_data != sp_data_nogap:
            try:
                print(sp_data)
            except:
                pass
            first_word = sp_data.split()[0]
            if len(first_word) == 1:
                df_line_index['data'][sp_line] = (sp_data_nogap.upper()).rjust(len(sp_data_nogap)+new_speaker_indent)
                try:
                    print("speaker name corrected to:",df_line_index['data'][sp_line])
                except:
                    pass
                audit_df['space_removed_between_characters'][line_no] = 'Yes'

    # Back to a positional index; from here on `df` is the repaired copy.
    df_line_index.reset_index(inplace=True)
    df = df_line_index.sort_index().reset_index(drop=True)

    # --- Pass 4: lines that contain a speaker name mixed with other text.
    # astype(str) also turns missing values into the text 'nan'.
    df['ps_not_to_remove'] = df['ps_not_to_remove'].astype(str)
    try:
        print(unique_speaker_list)
    except:
        pass
    print("performing checks for speaker followed by parenthetical")
    # NOTE(review): `df` is re-bound to a re-indexed frame inside this loop when
    # a row is inserted, while the loop keeps iterating the index captured at
    # its start; later positions then refer to shifted rows.
    for index in df.index:
        if df['Identification_Status'][index] == 'ps8':
            continue
        line_no = df['line_no'][index]
        print("line_no",line_no)
        print(df.dtypes)
        new_line_no = 0.0
        data = df['data'][index]
        cur_pos_list = df['Identification_Status'][index].split(";")
        top_pos = cur_pos_list[0]
        new_pos_list = cur_pos_list
        try:
            pos_not_to_remove = df['ps_not_to_remove'][index].split(";")
        except:
            pos_not_to_remove = []
        # Only protect possibilities the line currently has.
        set_1 = set(cur_pos_list)
        set_2 = set(pos_not_to_remove)
        pos_not_to_remove = list(set.intersection(set_1,set_2))
        for speaker in unique_speaker_list:
            check_done =False
            # NOTE(review): the speaker name is used as a regular expression, and
            # this test is case-sensitive while the two searches below pass
            # re.IGNORECASE.
            if re.search(speaker,data) and df['Identification_Status'][index] not in ('ps7','ps8') :
                pos_starts = re.search(speaker,data,re.IGNORECASE).start()
                pos_end = re.search(speaker,data,re.IGNORECASE).end()
                before_speaker = data[:pos_starts]
                after_speaker = data[pos_end:]
                print("match found")
                try:
                    print("data 2347:",data)
                    print("speaker 2348:", speaker)
                    print("before speaker 2349 :", before_speaker)
                    print("after speaker 2350:",after_speaker)
                except:
                    pass
                # First non-blank character after the name ('' when there is none).
                try:
                    char1_after_speaker = after_speaker.lstrip()[0]
                except:
                    char1_after_speaker = ''
                cur_indent = df['ssc'][index]
                try:
                    new_speaker_indent = int(cur_indent)
                except:
                    new_speaker_indent = 35
                try:
                    print(before_speaker)
                    print(after_speaker)
                    print("char1_after_speaker 2367:",char1_after_speaker)
                except:
                    pass
                # Case 1: "<indent>NAME (parenthetical)" -> split into a speaker row
                # and a new parenthetical row.
                if before_speaker.isspace() and char1_after_speaker == '(' and df['parenthetical'][index] == 'PartMidEnd' and "V.O." not in str(after_speaker):
                    print ("Seperating parenthetical")
                    print("Identifying speaker")
                    print(index)
                    df['data'][index] = before_speaker + speaker
                    df['parenthetical'][index] = 'Absent'
                    df['When_Identified'][index] = 'ExaminingSpeakerLines'
                    df['case'][index] = 'AllUpper'
                    df['Identification_Status'][index] = 'ps7'
                    nlb = df['nlb'][index]
                    df['nlb'][index] = 'N'
                    audit_df['line_broken_into_multiple_lines'][line_no] = 'Separated Speaker and Parenthetical'
                    new_line_no = line_no + 0.5
                    print(type(line_no),type(new_line_no))
                    print("identifying parenthetical")
                    print(df.dtypes)
                    # The fractional label places the new row directly after the
                    # speaker row once the frame is sorted by index.
                    df.loc[index + 0.25] = np.nan
                    df.loc[index + 0.25,'data'] = str(after_speaker)
                    df.loc[index + 0.25,'parenthetical'] = 'Complete'
                    df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines'
                    df.loc[index + 0.25,'Identification_Status'] = 'ps10'
                    df.loc[index + 0.25,'case'] = ''
                    df.loc[index + 0.25,'plb'] = 'N'
                    df.loc[index + 0.25,'nlb'] = nlb
                    df.loc[index + 0.25,'line_no'] = new_line_no
                    df = df.sort_index().reset_index(drop=True)
                    audit_df.loc[new_line_no] = np.nan
                    audit_df['line_removed'][new_line_no] = 'No'
                    print(df.dtypes)
                    print(audit_df.dtypes)
                    # Moves on to the next speaker name; the write-back below is skipped.
                    continue
                # Case 2: name followed by '(' in any other parenthetical state.
                elif before_speaker.isspace() and char1_after_speaker == '(' and df['parenthetical'][index] != 'PartMidMid':
                    print ("parenthetical mix")
                    new_pos_list = ['ps8','ps25']
                    if df['ssc'][index] != 35:
                        df['ssc'][index] = new_speaker_indent
                        df['data'][index] = data.strip().rjust(len(data.strip()) + new_speaker_indent)
                # Case 3: name at line start followed by non-blank text.
                elif before_speaker.isspace() and (not after_speaker.isspace()) :
                    ps_remove = ['ps1','ps2','ps3','ps7','ps9','ps10','ps11','ps12','ps16','ps17','ps18','ps19','ps21','ps22','ps26','ps27','ps28','ps29','ps31']
                    # new_pos_list may still be the same list object as cur_pos_list
                    # here, so these removals can also change cur_pos_list.
                    for ps in ps_remove:
                        try:
                            new_pos_list.remove(ps)
                        except:
                            continue
                    print ("present but not parenthetical removed except - ps30;ps4;ps5;ps6;ps8;ps13;ps14;ps15")
                # Case 4: text before the name and only whitespace after it.
                elif (not before_speaker.isspace()) and after_speaker.isspace():
                    new_pos_list = ['ps21','ps28','ps5','ps4']
                    print ("before speaker present")
                # NOTE(review): placement inferred -- read here as "a speaker matched,
                # stop trying further names", with the `else` below paired with
                # the re.search test. Confirm against the original layout.
                check_done = True
            else:
                new_pos_list = cur_pos_list
                print("no change done")
            # Put protected possibilities back at the front if they were dropped.
            for ps in pos_not_to_remove:
                if ps not in new_pos_list:
                    new_pos_list.insert(0,ps)
            df['Identification_Status'][index] = ";".join([str(elem) for elem in new_pos_list])
            print('\n')
            if check_done:
                break

    # A row counts as identified when it is blank or has exactly one possibility.
    for index in df.index:
        if df['Identification_Status'][index] == 'blank' or (len(df['Identification_Status'][index].split(";")) == 1):
            df['isIdentified'][index] = 'Yes'
        else:
            df['isIdentified'][index] = 'No'
    return df
def examine_speaker_next_lines(df,audit_df):
    """Classify the lines that follow each speaker line (parentheticals, dialogue).

    NOTE(review): this function was recovered from a whitespace-collapsed copy
    of the file. Statements are unchanged, but the nesting of several branches
    (marked below) had to be inferred -- confirm against version control.

    For every row tagged 'ps7' or 'ps8' the code:
      1. walks forward over parenthetical lines directly after the speaker,
         tagging them 'ps10'/'ps11'/'ps12'/'ps20';
      2. requires 'ps15' among the next line's possibilities, otherwise moves
         on to the next speaker;
      3. extends the candidate dialogue run over following lines, using the
         indent, parenthetical state, case and 'nlb' flag in the while-test;
      4. inspects the last line of the run to fix the dialogue end, splitting
         off trailing parenthetical text into new rows where needed;
      5. tags the first line of the run and then the lines in between.
    Finally 'isIdentified' is recomputed for every row.

    Parameters:
        df: frame of script lines; its index is reset in place on entry.
            Columns read here: 'Identification_Status', 'line_no', 'data',
            'nlb', 'plb', 'parenthetical', 'ssc', 'case', 'When_Identified',
            'isIdentified'.
        audit_df: audit frame addressed by line number; rows are added and the
            columns 'line_removed' and 'line_broken_into_multiple_lines' are
            assigned.

    Returns:
        The frame, re-bound to a sorted and re-indexed copy whenever a row was
        inserted.

    NOTE(review): writes use chained indexing (``df[col][i] = v`` and
    ``audit_df.loc[k][col] = v``). pandas documents this as unreliable; the
    ``audit_df.loc[k][col] = v`` form in particular may assign to a temporary
    object and leave ``audit_df`` unchanged.
    """
    df.reset_index(inplace=True, drop=True)
    # Line numbers of all rows currently tagged as speaker.
    speaker_lines_list = df.loc[(df['Identification_Status'] == 'ps7') | (df['Identification_Status'] == 'ps8'),'line_no'].to_list()
    for line in speaker_lines_list:
        blank_to_delete = []
        index = df.loc[df['line_no'] == line,:].index.values[0]
        data = df['data'][index]
        speaker_name = data.strip()
        print("\n")
        print("speaker index",index)
        print("speaker line no",line)
        try:
            print("data:\n",data)
        except:
            pass
        # Step to the next non-blank line ('nlb' == 'Y' means the next line is blank).
        if df['nlb'][index] == 'Y':
            index += 2
        else:
            index += 1
        possible_dialog_line = False
        parenthetical_begun = False
        # --- Step 1: consume parenthetical lines between speaker and dialogue.
        # NOTE(review): no bounds check on `index`; running past the last row
        # would raise inside this loop.
        while not possible_dialog_line:
            data = df['data'][index]
            line_no = df['line_no'][index]
            cur_line_par = df['parenthetical'][index]
            print("examining line")
            try:
                print(data)
                print(line_no,cur_line_par)
            except:
                pass
            if df['Identification_Status'][index] == 'ps10':
                # Already a complete parenthetical: keep walking.
                index += 1
            elif cur_line_par == 'Complete':
                print("Identifying as Parenthetical Complete")
                df['Identification_Status'][index] = 'ps10'
                df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
                index +=1
                possible_dialog_line = True
            elif cur_line_par == 'StartingLeft' and not(parenthetical_begun) :
                print("identifying as parenthetical Beginning")
                df['Identification_Status'][index] = 'ps11'
                df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
                parenthetical_begun = True
                index +=1
            elif cur_line_par == 'EndingRight' and parenthetical_begun:
                print("Identifying as parenthetical end")
                df['Identification_Status'][index] = 'ps12'
                df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
                index +=1
                possible_dialog_line = True
            elif cur_line_par == 'Absent' and parenthetical_begun:
                # Inside an open multi-line parenthetical.
                print("Identifying as parenthetical middle")
                df['Identification_Status'][index] = 'ps20'
                df['When_Identified'][index] = 'ExaminingLinesAfterSpeaker'
                index +=1
            elif df['Identification_Status'][index] == 'ps13' or df['Identification_Status'][index] == 'ps14':
                # Already tagged dialogue begin/middle: the end may still follow.
                index += 1
            elif cur_line_par == 'Absent':
                print("line should be dialogue")
                possible_dialog_line = True
            else:
                print("line could be dialogue")
                possible_dialog_line = True
            print(line_no,possible_dialog_line,parenthetical_begun)
            # NOTE(review): whether this blank-skip sits inside or after the
            # while loop cannot be told from the collapsed source.
            if df['Identification_Status'][index] == 'blank':
                print("skipping blank line")
                blank_to_delete.append(index)
                index +=1

        # --- Step 2: the line must at least be a dialogue candidate.
        if 'ps15' not in df['Identification_Status'][index].split(";") :
            print("line does not have possibility of dialogue, so cannot process")
            continue
        else:
            print("dialogue line(s) after speaker")
        cur_indent = df['ssc'][index]
        cur_line_par = df['parenthetical'][index]
        next_line_blank = True if df['nlb'][index] == 'Y' else False
        # This break leaves the whole speaker loop, not just the current speaker.
        if index+2 > df.index[-1]:
            break
        if next_line_blank:
            next_nbl_indent = df['ssc'][index+2]
            next_nbl_par = df['parenthetical'][index+2]
            next_nbl_case = df['case'][index+2]
            next_nbl_data = df['data'][index+2]
            j = index + 2
        else:
            next_nbl_indent = df['ssc'][index+1]
            next_nbl_par = df['parenthetical'][index+1]
            next_nbl_case = df['case'][index+1]
            next_nbl_data = df['data'][index+1]
            j = index + 1
        start_index = index
        rev_index = index
        lines_count = 1
        data = df['data'][index]
        print("all lines after speaker with same indent with parentheticals")
        try:
            print(data)
        except:
            pass
        dia_indent = cur_indent
        print(dia_indent)
        # --- Step 3: extend the run while the next line has a matching indent (or
        # is a parenthetical), is not all upper case, and no blank line intervenes.
        while (cur_indent == next_nbl_indent or dia_indent == next_nbl_indent or next_nbl_par in ('Complete','StartingLeft','EndingRight')) and not (next_nbl_case == 'AllUpper') and (not next_line_blank):
            lines_count +=1
            cur_indent = next_nbl_indent
            cur_line_par = next_nbl_par
            next_line_blank = True if df['nlb'][j] == 'Y' else False
            data = df['data'][j]
            try:
                print(data)
                print(cur_indent)
            except:
                pass
            rev_index = j
            if j+1 >= df.index[-1]:
                break
            if next_line_blank:
                # A blank line ends the dialogue run.
                break
            else:
                next_nbl_indent = df['ssc'][j+1]
                next_nbl_par = df['parenthetical'][j+1]
                next_nbl_case = df['case'][j+1]
                next_nbl_data = df['data'][j+1]
                j += 1
        print("\n Next line indent is",next_nbl_indent)

        # --- Step 4: examine the last line of the run (rev_index) to fix the end.
        last_line_par = cur_line_par
        data = df['data'][rev_index]
        dialogue_end_identified = False
        dia_end = rev_index
        if last_line_par == 'PartMidEnd':
            # Dialogue text followed by a parenthetical on the same line: keep the
            # text before '(' as dialogue end and move the rest to a new row.
            print("Dialogue mixed with parenthetical")
            par_start = re.search('\(',data).start()
            before_par = data[:par_start]
            after_par = data[par_start:]
            print(" identifying before parenthentical line as ps15")
            try:
                print(before_par)
            except:
                pass
            df['data'][rev_index] = before_par
            next_line_flag = df['nlb'][rev_index]
            df['nlb'][rev_index] = 'N'
            df['Identification_Status'][rev_index] = 'ps15'
            df['parenthetical'][rev_index] = 'Absent'
            df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
            df['isIdentified'][rev_index] = 'Yes'
            dialogue_end_identified = True
            print("action after dialogue, separating to newline , identifying line as ps6")
            try:
                print(after_par)
                print("after_par is here")
            except:
                pass
            print("df['line_no'][rev_index]:",df['line_no'][rev_index])
            print(df['line_no'][rev_index])
            # The new row gets a line number halfway to the next line; if that is
            # already present in audit_df the step is halved once more.
            line_no = df['line_no'][rev_index]
            next_line_no = df['line_no'][rev_index+1]
            new_line_no = (float(line_no) + float(next_line_no)) / 2
            if new_line_no in audit_df.index:
                new_line_no = (float(new_line_no) + float(next_line_no))/2
            new_line_no = new_line_no
            audit_df.loc[new_line_no] = np.nan
            audit_df.loc[new_line_no]['line_removed'] = 'No'
            audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
            # Fractional label so the new row sorts directly after rev_index.
            df.loc[rev_index + 0.25] = np.nan
            df.loc[rev_index + 0.25,'ssc'] = 0
            # The new line is the speaker name followed by the parenthetical text
            # with its brackets removed.
            new_data = speaker_name.capitalize() + ' ' + after_par.replace('(','').replace(')','')
            df.loc[rev_index + 0.25,'data'] = new_data
            df.loc[rev_index + 0.25,'case'] = ''
            df.loc[rev_index + 0.25,'plb'] = 'N'
            df.loc[rev_index + 0.25,'nlb'] = next_line_flag
            df.loc[rev_index + 0.25,'Identification_Status'] = 'ps6'
            df.loc[rev_index + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
            df.loc[rev_index + 0.25,'isIdentified'] = 'Yes'
            df.loc[rev_index + 0.25,'parenthetical'] = 'Absent'
            df.loc[rev_index + 0.25,'line_no'] = new_line_no
            try:
                print(new_data)
            except:
                pass
            df = df.sort_index().reset_index(drop=True)
        elif last_line_par == 'EndingRight':
            # The run ends with a closing parenthetical: walk backwards to its start.
            print("last line has parenthtical end")
            j=1
            beginning_not_found = True
            # NOTE(review): no lower bound on rev_index-j; if no opening line
            # exists the lookup eventually fails instead of terminating cleanly.
            while beginning_not_found:
                print("looking for beginning parenthtical")
                data = df['data'][rev_index-j]
                try:
                    print(data)
                except:
                    pass
                if df['parenthetical'][rev_index-j] in ('StartingLeft','MixedLeft') :
                    beginning_not_found = False
                else:
                    j+=1
            if beginning_not_found == False :
                print("parenthetical beginning found")
                if df['parenthetical'][rev_index-j] == 'MixedLeft' and (rev_index-j)>=start_index:
                    # Opening line mixes dialogue and '(': split it at the bracket.
                    data = df['data'][rev_index-j]
                    par_start = re.search('\(',data).start()
                    before_par = data[:par_start]
                    after_par = data[par_start:]
                    print(" splitting and identifying before parenthentical line as ps15")
                    try:
                        print(before_par)
                    except:
                        pass
                    print("here")
                    df['data'][rev_index-j] = before_par
                    next_line_flag = df['nlb'][rev_index-j]
                    df['nlb'][rev_index-j] = 'N'
                    df['Identification_Status'][rev_index-j] = 'ps15'
                    df['parenthetical'][rev_index-j] = 'Absent'
                    df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
                    df['isIdentified'][rev_index-j] = 'Yes'
                    dialogue_end_identified = True
                    dia_end = rev_index-j
                    print("action after dialogue, separating to newline ")
                    try:
                        print(after_par)
                        print("after par")
                    except:
                        pass
                    line_no = df['line_no'][rev_index-j]
                    print("2799",type(line_no))
                    # NOTE(review): the print inside this try reads new_line_no, which
                    # may not be bound yet at this point; the bare except then takes
                    # the fallback path.
                    try:
                        next_line_no = df['line_no'][rev_index-j+1]
                        print("2802",type(new_line_no))
                    except:
                        next_line_no = df['line_no'][int(rev_index-j+1)]
                        print("2805",rev_index-j+1)
                    try:
                        new_line_no = (line_no + next_line_no) / 2
                    except:
                        new_line_no = (float(line_no) + float(next_line_no)) / 2
                    try:
                        if new_line_no in audit_df.index:
                            new_line_no = (new_line_no + next_line_no)/2
                    except:
                        if new_line_no in audit_df.index:
                            new_line_no = (float(new_line_no) + float(next_line_no))/2
                    audit_df.loc[new_line_no] = np.nan
                    audit_df.loc[new_line_no]['line_removed'] = 'No'
                    audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Action from Dialogue and added Speaker'
                    # New row holding the text from '(' onwards; it is created with
                    # parenthetical state 'StartingLeft' and no Identification_Status.
                    df.loc[rev_index-j + 0.25] = np.nan
                    df.loc[rev_index-j + 0.25,'ssc'] = 0
                    df.loc[rev_index-j + 0.25,'data'] = after_par
                    df.loc[rev_index-j + 0.25,'case'] = ''
                    df.loc[rev_index-j + 0.25,'plb'] = 'N'
                    df.loc[rev_index-j + 0.25,'nlb'] = next_line_flag
                    df.loc[rev_index-j + 0.25,'When_Identified'] = 'ExaminingLinesAfterSpeaker'
                    df.loc[rev_index-j + 0.25,'isIdentified'] = 'Yes'
                    df.loc[rev_index-j + 0.25,'parenthetical'] = 'StartingLeft'
                    df.loc[rev_index-j + 0.25,'line_no'] = new_line_no
                    df = df.sort_index().reset_index(drop=True)
                    # One row was inserted above rev_index, so shift it down by one;
                    # rev_index-j now addresses the inserted row.
                    rev_index += 1
                if df['parenthetical'][rev_index-j] == 'StartingLeft' and (rev_index-j)>=start_index:
                    # The bracketed block is re-tagged 'ps4' (first line), 'ps5'
                    # (lines in between) and 'ps6' (last line); the speaker name is
                    # prefixed and the brackets are removed.
                    if j >=1:
                        df['Identification_Status'][rev_index-j] = 'ps4'
                        df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
                        df['isIdentified'][rev_index-j] = 'Yes'
                        cur_data = df['data'][rev_index-j]
                        new_data = speaker_name.capitalize() + ' ' + cur_data.replace('(','').strip()
                        try:
                            print(new_data)
                        except:
                            pass
                        df['data'][rev_index-j] = new_data
                        df['parenthetical'][rev_index-j] = 'Absent'
                        j -= 1
                        while j != 0:
                            df['Identification_Status'][rev_index-j] = 'ps5'
                            df['When_Identified'][rev_index-j] = 'ExaminingLinesAfterSpeaker'
                            df['isIdentified'][rev_index-j] = 'Yes'
                            try:
                                print(df['data'][rev_index-j])
                            except:
                                pass
                            j -= 1
                        df['Identification_Status'][rev_index] = 'ps6'
                        df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
                        df['isIdentified'][rev_index] = 'Yes'
                        cur_data = df['data'][rev_index]
                        new_data = cur_data.replace(')','').strip()
                        df['data'][rev_index] = new_data
                        df['parenthetical'][rev_index] = 'Absent'
                        try:
                            print(new_data)
                        except:
                            pass
        elif last_line_par == 'Absent' and df['case'][rev_index] != 'AllUpper':
            print("Identifying as dialogue end")
            df['Identification_Status'][rev_index] = 'ps15'
            df['When_Identified'][rev_index] = 'ExaminingLinesAfterSpeaker'
            df['isIdentified'][rev_index] = 'Yes'
            dialogue_end_identified = True

        # Without a known dialogue end nothing more can be tagged for this speaker.
        if not dialogue_end_identified:
            print("Could not identify the dialogue")
            continue
        else:
            print("dialogue end identfied as")
            data = df['data'][dia_end]
            try:
                print(data)
            except:
                pass

        # --- Step 5a: the first line of the run is the dialogue beginning.
        j = start_index
        cur_line_par = df['parenthetical'][j]
        data = df['data'][j]
        parenthetical_begun = False
        if j < dia_end :
            print("\n")
            try:
                print(data)
            except:
                pass
            # NOTE(review): `x in ('PartMidEnd')` tests against a plain string (the
            # parentheses do not make a tuple), i.e. it is a substring test.
            if cur_line_par == 'Absent':
                print("Identifying as dialogue begining")
                df['Identification_Status'][j] = 'ps13'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par in ('PartMidEnd'):
                print("Identifying as dialogue mixed with parenthetical")
                df['Identification_Status'][j] = 'ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par == 'MixedLeft':
                print("Identifying as dialogue mixed with parenthtical")
                df['Identification_Status'][j] = 'ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = True
            elif cur_line_par == 'StartingLeft':
                print("Identifying as parenthetical beginning")
                df['Identification_Status'][j] = 'ps11'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = True
            elif cur_line_par in ('PartStartMid'):
                print("Identifying as parenthetical mixed with dialog")
                df['Identification_Status'][j] = 'ps26'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par in ('PartMidMid') :
                print("Identifying as dialogue mixed with parenthtical ")
                df['Identification_Status'][j] = 'ps26;ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            next_line_blank = True if df['nlb'][j] == 'Y' else False
            if next_line_blank :
                j += 2
            else:
                j += 1
        else:
            # Single-line dialogue: the end was already tagged above.
            continue

        # --- Step 5b: everything between beginning and end is dialogue middle,
        # unless a parenthetical state says otherwise.
        cur_line_par = df['parenthetical'][j]
        data = df['data'][j]
        while j < dia_end :
            print("\n")
            try:
                print(data)
            except:
                pass
            if cur_line_par == 'Absent':
                print("Identifying as dialogue middle")
                df['Identification_Status'][j] = 'ps14'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par in ('PartMidEnd'):
                print("Identifying as dialogue mixed with parenthetical")
                df['Identification_Status'][j] = 'ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par == 'MixedLeft':
                print("Identifying dialogue mixed with parenthtical")
                df['Identification_Status'][j] = 'ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = True
            elif cur_line_par == 'StartingLeft':
                print("Identifying as parenthetical beginning")
                df['Identification_Status'][j] = 'ps11'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = True
            # NOTE(review): the first branch already handles the exact value
            # 'Absent', so this branch only fires for other substrings of 'Absent'.
            elif cur_line_par in ('Absent') and parenthetical_begun:
                print("Identifying as parenthetical middle")
                df['Identification_Status'][j] = 'ps20'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = True
            elif cur_line_par in ('EndingRight') and parenthetical_begun:
                print("Identifying as parenthetical ending")
                df['Identification_Status'][j] = 'ps12'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = False
            elif cur_line_par in ('MixedRight') and parenthetical_begun:
                print("Identifying as dialogue mixed with parenthetical ")
                df['Identification_Status'][j] = 'ps26'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
                parenthetical_begun = False
            elif cur_line_par in ('PartStartMid'):
                print("Identifying as parenthetical mixed with dialog")
                df['Identification_Status'][j] = 'ps26'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            elif cur_line_par in ('PartMidMid') :
                print("Identifying as dialogue mixed with parenthtical ")
                df['Identification_Status'][j] = 'ps26;ps27'
                df['When_Identified'][j] = 'ExaminingLinesAfterSpeaker'
                df['isIdentified'][j] = 'Yes'
            # Advance to the next non-blank line and load its state for the next pass.
            next_line_blank = True if df['nlb'][j] == 'Y' else False
            if next_line_blank :
                j += 2
            else:
                j += 1
            data = df['data'][j]
            print("printing_data before loop")
            cur_line_par = df['parenthetical'][j]

    # A row counts as identified when it is blank or has exactly one possibility.
    for index in df.index:
        if df['Identification_Status'][index] == 'blank' or (len(df['Identification_Status'][index].split(";")) == 1):
            df['isIdentified'][index] = 'Yes'
        else:
            df['isIdentified'][index] = 'No'
    return df
def prep_pnnbl_wts(csv_pnbl_nnbl, cur_dir):
    """Split the combined PNBL/NNBL matrix CSV into two weight sheets.

    The first physical line of ``csv_pnbl_nnbl`` is skipped and the second is
    used as the header. Column 0 plus columns 28-39 become
    ``pnbl_weights.csv``; column 0 plus columns 41-52 become
    ``nnbl_weights.csv``. Both files are written into ``cur_dir`` without the
    index, with the headers replaced by the fixed names listed below.

    Parameters:
        csv_pnbl_nnbl: path (or file-like object) of the combined matrix CSV.
            It must have at least 53 columns.
        cur_dir: directory that receives the two output files.

    Returns:
        None. The results are the two CSV files.

    Raises:
        IndexError: if the matrix has fewer columns than the positions used.
    """
    # (output file, column positions in the matrix, headers to write)
    # The header order mirrors the column order of the source matrix, which is
    # why the two sheets list the 'ps' labels in different sequences.
    sheets = (
        (
            'pnbl_weights.csv',
            [0] + list(range(28, 40)),
            ['Possibilities', 'ps2', 'ps1', 'ps3', 'ps4', 'ps5', 'ps6',
             'ps7', 'ps10', 'ps13', 'ps14', 'ps15', 'ps16'],
        ),
        (
            'nnbl_weights.csv',
            [0] + list(range(41, 53)),
            ['Possibilities', 'ps3', 'ps2', 'ps1', 'ps16', 'ps13', 'ps14',
             'ps15', 'ps10', 'ps7', 'ps4', 'ps5', 'ps6'],
        ),
    )

    pnbl_nnbl_df = pd.read_csv(csv_pnbl_nnbl, skiprows=[0])

    for file_name, positions, headers in sheets:
        # Explicit copy: the headers are replaced on an independent frame
        # instead of renaming a slice of the matrix in place.
        weights_df = pnbl_nnbl_df.iloc[:, positions].copy()
        # Positional assignment does not depend on what the source labels are.
        weights_df.columns = headers
        weights_df.to_csv(os.path.join(cur_dir, file_name), index=False)
def identify_using_pnbl_nnbl(df, identify_using, iteration):
    """Narrow each ambiguous line's possibilities using its non-blank neighbours.

    A line is ambiguous when its 'Identification_Status' holds more than one
    ';'-separated possibility. For each such line the nearest previous and
    next rows whose status is not 'blank' are looked up. When a neighbour is
    itself identified (``isIdentified`` is not 'No' and its status is a single
    value), the module-level weight sheets ``pnbl_df`` / ``nnbl_df`` supply the
    possibilities with a weight > 0 in the column named after that neighbour's
    status. The line keeps the intersection of its current possibilities with
    both neighbour sets, plus anything listed in 'ps_not_to_remove'. The
    survivors are ordered by the line's own per-possibility weight columns
    (highest first); a possibility with no usable weight is dropped.

    NOTE(review): ``pnbl_df`` and ``nnbl_df`` are read as module globals and
    are assumed to be indexed by possibility label ('psN') -- confirm where
    they are loaded.
    NOTE(review): neighbour lookup uses ``index +/- 1`` arithmetic and so
    assumes a default 0..n-1 integer index, as the original did.

    Parameters:
        df: frame of script lines, modified in place. Reads ``identify_using``,
            'line_no', 'data', 'isIdentified', 'ps_not_to_remove' and the
            per-possibility weight columns; writes 'Identification_Status',
            'Identification_Status_with_weights', 'pos_using_pnbl',
            'pos_using_nnbl', 'pos_using_pnbl_nnbl', 'pnbl_identified_as',
            'nnbl_identified_as', 'When_Identified' and a new
            'CountofPossibilities_afterIteration<iteration>' column.
        identify_using: name of the column holding the current possibilities.
        iteration: iteration number, used only in the count column's name.

    Returns:
        Tuple ``(df, new_lines_identified, identify_using,
        count_lines_identified, line_nos_identified, pos_decreased)``.
        ``identify_using`` becomes 'Identification_Status' once at least one
        line was narrowed to a single possibility; ``pos_decreased`` tells
        whether the total number of possibilities went down.
    """

    def take_numeric(ps):
        # 'ps13' -> 13, so possibilities sort numerically rather than as text.
        return int(ps[2:])

    def show_data(pos):
        # Script text may not be printable on every console; never fail on it.
        try:
            print(df.at[pos, 'data'])
        except (KeyError, UnicodeError):
            pass

    def possibilities_from_neighbour(index, step, current):
        """Return the possibilities allowed by the neighbour in direction *step*.

        Falls back to *current* when there is no usable neighbour.
        """
        tag, word = ('pnbl', 'previous') if step < 0 else ('nnbl', 'next')
        print("looking for", word, "non-blank line")
        pos = index + step
        while 0 <= pos < len(df) and df.at[pos, 'Identification_Status'] == 'blank':
            print(word, "line is blank, moving to subsequent", word, "line")
            pos += step
        if not 0 <= pos < len(df):
            return current
        print("found", word, "non-blank line")

        neighbour_status = df.at[pos, 'Identification_Status']
        if df.at[pos, 'isIdentified'] == 'No' or len(neighbour_status.split(";")) > 1:
            print("but as", word, "non-blank line is unidentified so cannot perform",
                  tag, "check, so skipping")
            return current

        print("AND", word, "non-blank line is already identified as", neighbour_status)
        show_data(pos)
        df.at[index, tag + '_identified_as'] = neighbour_status
        try:
            # Looked up lazily so a missing sheet only matters when it is needed.
            weights = pnbl_df if step < 0 else nnbl_df
            column = weights[neighbour_status]
            allowed = sorted(column[column > 0].index, key=take_numeric)
        except (KeyError, TypeError, ValueError):
            print(tag, "weights sheet does not have column", neighbour_status)
            return current
        df.at[index, 'pos_using_' + tag] = ';'.join(str(elem) for elem in allowed)
        return allowed

    def protected_possibilities(index):
        """Return the usable entries of this row's 'ps_not_to_remove' cell."""
        if 'ps_not_to_remove' not in df.columns:
            return []
        raw = df.at[index, 'ps_not_to_remove']
        if not isinstance(raw, str):
            return []
        # Blank entries and the text 'nan' (a stringified missing value) are
        # placeholders, not possibilities; keeping them would stop a line with
        # one real survivor from being counted as identified.
        return [ps for ps in raw.split(";") if ps.strip() and ps != 'nan']

    count_lines_identified = 0
    new_lines_identified = False
    line_nos_identified = []
    total_pos_before = 0
    total_pos_after = 0

    # Working column; for the first iteration this is the strict-conditions output.
    df['Identification_Status'] = df[identify_using]
    pos_count_column_name = 'CountofPossibilities_afterIteration' + str(iteration)
    # object dtype so the column can hold '' for untouched rows and ints otherwise.
    df[pos_count_column_name] = pd.Series('', index=df.index, dtype=object)

    for index in df.index:
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        line_no = df.at[index, 'line_no']

        if len(cur_line_pos) <= 1:
            print(line_no, ": line already identified as", df.at[index, 'Identification_Status'])
            total_pos_before += 1
            total_pos_after += 1
            continue

        print(line_no, ": line currently has more than one possibilties")
        show_data(index)
        print(cur_line_pos)
        total_pos_before += len(cur_line_pos)

        line_pos_using_pnbl = possibilities_from_neighbour(index, -1, cur_line_pos)
        line_pos_using_nnbl = possibilities_from_neighbour(index, 1, cur_line_pos)

        # Only possibilities the line already had may survive.
        pos_using_pnbl_nnbl = sorted(
            set(line_pos_using_pnbl) & set(line_pos_using_nnbl) & set(cur_line_pos)
        )
        # Put protected possibilities back at the front if they were eliminated.
        for ps in protected_possibilities(index):
            if ps not in pos_using_pnbl_nnbl:
                pos_using_pnbl_nnbl.insert(0, ps)
        print("pos_using_pnbl_nnbl is ", pos_using_pnbl_nnbl)

        if len(pos_using_pnbl_nnbl) == 1:
            new_lines_identified = True
            identify_using = 'Identification_Status'
            count_lines_identified += 1
            line_nos_identified.append(line_no)
            df.at[index, 'When_Identified'] = 'PNBL_NNBL'

        # Attach this line's weight to every surviving possibility.
        weighted = []
        for pos in pos_using_pnbl_nnbl:
            try:
                weight = int(df.at[index, pos])
            except (KeyError, TypeError, ValueError):
                print("could not find an integer weight for pos ", pos, " at index ", index)
                continue
            weighted.append((pos, weight))
        print("pos_with_weights is ", weighted)

        # Highest weight first; the sort is stable, so ties keep their order.
        # Sorting on the number itself also copes with negative weights.
        weighted.sort(key=lambda item: item[1], reverse=True)
        df.at[index, 'Identification_Status_with_weights'] = ';'.join(
            "%s-%s" % item for item in weighted
        )
        line_pos_string = ';'.join(pos for pos, _ in weighted)
        print("line new possibilites", line_pos_string)
        df.at[index, 'pos_using_pnbl_nnbl'] = line_pos_string
        df.at[index, 'Identification_Status'] = line_pos_string

        # One count column per iteration.
        df.at[index, pos_count_column_name] = len(weighted)
        total_pos_after += len(weighted)

    print("new lines identified :", new_lines_identified)
    print(total_pos_before, total_pos_after)
    pos_decreased = (total_pos_before - total_pos_after) > 0
    return (df, new_lines_identified, identify_using, count_lines_identified,
            line_nos_identified, pos_decreased)
# Neighbour rules used by remove_ineligible_pos.  For every possibility code:
# (codes accepted on the previous line, codes accepted on the next line).
# A code that has no entry here is never filtered out.
_PS_NEIGHBOUR_RULES = {
    'ps1': (('blank', 'ps6', 'ps15', 'ps16', 'ps17'),
            ('blank', 'ps4', 'ps6')),
    'ps2': (('blank', 'ps6', 'ps15', 'ps16', 'ps17'),
            ('ps3',)),
    'ps3': (('ps2',),
            ('blank', 'ps4', 'ps6')),
    'ps4': (('blank', 'ps1', 'ps3', 'ps15'),
            ('blank', 'ps5', 'ps6')),
    'ps5': (('ps4', 'ps5'),
            ('ps5', 'ps6')),
    'ps6': (('blank', 'ps1', 'ps3', 'ps4', 'ps5', 'ps15'),
            ('blank', 'ps1', 'ps2', 'ps4', 'ps6', 'ps7', 'ps8', 'ps16')),
    'ps7': (('blank', 'ps6'),
            ('blank', 'ps10', 'ps11', 'ps13', 'ps15')),
    'ps8': (('blank', 'ps6'),
            ('ps9', 'ps10', 'ps11', 'ps13', 'ps15')),
    'ps9': (('ps7',),
            ('ps10', 'ps11', 'ps13', 'ps15')),
    'ps10': (('ps7', 'ps8', 'ps9'),
             ('ps13', 'ps15')),
    'ps11': (('ps7', 'ps8', 'ps9', 'ps14', 'blank'),
             ('ps12', 'ps26')),
    'ps12': (('ps11',),
             ('ps13',)),
    'ps13': (('blank', 'ps7', 'ps8', 'ps9', 'ps10', 'ps12'),
             ('blank', 'ps14', 'ps15')),
    'ps14': (('ps13', 'ps14', 'blank'),
             ('ps14', 'ps15', 'blank')),
    'ps15': (('ps7', 'ps8', 'ps9', 'ps10', 'ps12', 'ps13', 'ps14', 'blank'),
             ('blank', 'ps1', 'ps3', 'ps4', 'ps6', 'ps7', 'ps8', 'ps16')),
    'ps16': (('blank', 'ps6', 'ps15'),
             ('blank', 'ps1', 'ps3')),
    'ps17': (('blank', 'ps1', 'ps3', 'ps6', 'ps15', 'ps16', 'ps17', 'ps18'),
             ('ps1', 'ps2', 'ps8', 'ps16', 'ps17', 'ps18', 'ps19', 'blank')),
    'ps18': (('blank', 'ps6', 'ps15', 'ps16'),
             ('blank', 'ps5', 'ps16')),
    'ps19': (('blank', 'ps15', 'ps6'),
             ('blank', 'ps4', 'ps6')),
    'ps21': (('blank', 'ps1', 'ps3', 'ps4', 'ps5', 'ps15'),
             ('ps10', 'ps11', 'ps13')),
    'ps22': (('blank', 'ps1', 'ps3', 'ps4', 'ps5', 'ps15'),
             ('blank', 'ps1', 'ps3')),
    'ps25': (('blank', 'ps6'),
             ('ps12', 'ps13')),
    'ps26': (('ps7', 'ps8', 'ps9', 'ps11', 'blank', 'ps27'),
             ('ps14', 'ps15', 'ps27')),
    'ps27': (('ps7', 'ps8', 'ps9', 'ps10', 'ps12', 'ps13', 'ps14', 'ps26', 'blank'),
             ('ps26', 'ps14', 'ps15')),
    'ps28': (('ps7', 'ps8', 'ps9', 'ps10', 'ps12', 'ps13', 'ps14', 'blank'),
             ('blank', 'ps9', 'ps10', 'ps11', 'ps13', 'ps15')),
    'ps29': (('ps7', 'ps8', 'ps9', 'ps10', 'ps12', 'ps13', 'ps14'),
             ('blank', 'ps1', 'ps2', 'ps4', 'ps6', 'ps7', 'ps8', 'ps16')),
    'ps30': (('blank', 'ps6', 'ps7'),
             ('ps14', 'ps15', 'blank')),
    'ps31': (('blank', 'ps15', 'ps6'),
             ('blank', 'ps1', 'ps2')),
}


def remove_ineligible_pos(df, identify_using, iteration):
    """Drop possibilities that cannot sit between a line's two neighbours.

    Every row whose ``identify_using`` value holds more than one
    ``;``-separated possibility is checked against ``_PS_NEIGHBOUR_RULES``:
    a possibility survives when at least one possibility of the previous row
    is in its accepted-previous list (or the row is the first one) and at
    least one possibility of the next row is in its accepted-next list (or
    the row is the last one).  Possibilities listed in the optional
    ``ps_not_to_remove`` column are put back at the front.  The survivors are
    ordered by decreasing weight, the weight of possibility ``psN`` being read
    from the column named ``psN``; survivors without a usable weight are
    dropped from the stored result.

    Args:
        df: line table, modified in place.  Must already contain the columns
            ``Identification_Status_ineligible_removed``,
            ``CountofPossibilities_afterIneligibleRemoved``,
            ``When_Identified`` and ``Identification_Status_with_weights``.
        identify_using: name of the column holding the current possibilities.
        iteration: number used to name the per-iteration count column
            ``CountofEligiblePossibilities_afterIteration<iteration>``.

    Returns:
        ``(df, new_lines_identified, pos_decreased, count_lines_identified,
        total_pos)`` where ``total_pos`` is the number of possibilities left
        over all rows (before the weight lookup) and ``pos_decreased`` tells
        whether that total is lower than it was on entry.
    """

    def is_eligible(pos, pvs_line_pos, nxt_line_pos, first_line_flag, last_line_flag):
        rule = _PS_NEIGHBOUR_RULES.get(pos)
        if rule is None:
            # Unknown codes are kept as they are; an empty string never is.
            return bool(pos)
        allowed_prev, allowed_next = rule
        prev_ok = first_line_flag or any(ps in pvs_line_pos for ps in allowed_prev)
        next_ok = last_line_flag or any(ps in nxt_line_pos for ps in allowed_next)
        return prev_ok and next_ok

    labels = list(df.index)
    last_position = len(labels) - 1

    pos_count_column_name = 'CountofEligiblePossibilities_afterIteration' + str(iteration)
    # Object dtype: the column starts out blank and later receives integer counts.
    df[pos_count_column_name] = pd.Series('', index=df.index, dtype=object)

    count_lines_identified = 0
    new_lines_identified = False
    total_pos_before = 0
    total_pos = 0

    for position, index in enumerate(labels):
        line_pos = df.at[index, identify_using].split(";")
        total_pos_before += len(line_pos)

        if len(line_pos) == 1:
            # Already settled: carry the value over unchanged.
            df.at[index, 'Identification_Status_ineligible_removed'] = df.at[index, identify_using]
            total_pos += 1
            continue

        # Neighbours are looked up by position, so a one-row table (first and
        # last at once) and a table whose index does not start at 0 both work.
        first_line_flag = position == 0
        last_line_flag = position == last_position
        pvs_line_pos = [] if first_line_flag else df.at[labels[position - 1], identify_using].split(";")
        nxt_line_pos = [] if last_line_flag else df.at[labels[position + 1], identify_using].split(";")

        print('\n')
        print(index)
        print(pvs_line_pos)
        print(nxt_line_pos)
        print("current possibilities", line_pos)

        line_eligible_pos = [
            pos for pos in line_pos
            if is_eligible(pos, pvs_line_pos, nxt_line_pos, first_line_flag, last_line_flag)
        ]
        print("eligible possibilities", line_eligible_pos)

        # Put back the possibilities that must never be eliminated.
        try:
            pos_not_to_remove = df.at[index, 'ps_not_to_remove'].split(";")
        except (KeyError, AttributeError):
            # Column absent, or the cell is not a string.
            pos_not_to_remove = []
        for ps in pos_not_to_remove:
            if ps not in line_eligible_pos:
                line_eligible_pos.insert(0, ps)

        print(";".join(line_pos))
        print(";".join(line_eligible_pos))

        eligible_count = len(line_eligible_pos)
        df.at[index, 'CountofPossibilities_afterIneligibleRemoved'] = eligible_count
        df.at[index, pos_count_column_name] = eligible_count
        total_pos += eligible_count

        if eligible_count == 1:
            count_lines_identified += 1
            new_lines_identified = True
            df.at[index, 'When_Identified'] = 'RemovingIneligiblePossibilities'

        # Attach the weight of each survivor and order by decreasing weight.
        # (possibility, weight) pairs are sorted directly, so a negative
        # weight cannot break the sort key.
        weighted = []
        for pos in line_eligible_pos:
            try:
                raw_weight = df.at[index, str(pos)]
            except KeyError:
                print("could not find weight for pos ", pos, " at index ", index)
                continue
            try:
                weight = int(raw_weight)
            except (TypeError, ValueError):
                print("could not convert wt to int for pos ", pos, " at index ", index)
                continue
            weighted.append((pos, weight))
        weighted.sort(key=lambda item: item[1], reverse=True)

        with_weights = ';'.join(pos + '-' + str(weight) for pos, weight in weighted)
        without_weights = ';'.join(pos for pos, _ in weighted)
        print(with_weights)
        print(without_weights)
        df.at[index, 'Identification_Status_with_weights'] = with_weights
        df.at[index, 'Identification_Status_ineligible_removed'] = without_weights

    pos_decreased = (total_pos_before - total_pos) > 0
    print(total_pos_before, total_pos)
    return df, new_lines_identified, pos_decreased, count_lines_identified, total_pos
def do_while_pnnbl_ineligible(df):
    """Alternate the two narrowing passes until neither removes a possibility.

    One round consists of
      1. calling ``identify_using_pnbl_nnbl`` repeatedly while it reports
         newly identified lines or a drop in possibilities, then
      2. calling ``remove_ineligible_pos`` repeatedly under the same
         stopping rule and copying its result column back into
         ``Identification_Status``.
    Another round is started whenever step 2 lowered the total number of
    possibilities.

    Args:
        df: line table; helper columns are created on demand.

    Returns:
        The updated table.
    """
    weights_dir = mypath

    def load_weights(file_name):
        table = pd.read_csv(os.path.join(weights_dir, file_name),
                            index_col='Possibilities',
                            keep_default_na=False)
        table = table.head(34)
        return table.apply(pd.to_numeric, errors='ignore')

    # NOTE(review): both tables are loaded but not passed to anything below.
    pnbl_df = load_weights('pnbl_weights.csv')
    nnbl_df = load_weights('nnbl_weights.csv')

    def count_possibilities(frame):
        return sum(len(frame['Identification_Status'][label].split(";"))
                   for label in frame.index)

    pnnbl_columns = (
        'Identification_Status_with_weights',
        'pnbl_identified_as',
        'pos_using_pnbl',
        'nnbl_identified_as',
        'pos_using_nnbl',
        'pos_using_pnbl_nnbl',
    )

    total_pos_initial = count_possibilities(df)
    run_again = True

    while run_again:
        # ---- pass 1: previous / next non-blank line weights ----
        iteration = 1
        count_total = 0
        line_nos_identified = []
        line_nos_identified_iteration = []
        new_lines_identified = True
        pos_decreased = False

        if 'Identification_Status' in df.columns:
            identify_using = 'Identification_Status'
        else:
            identify_using = 'stage-1_output'

        for column in pnnbl_columns:
            if column not in df:
                df[column] = ''

        while new_lines_identified or pos_decreased:
            print("Identifying lines using pnbl_nnbl ")
            print("using:", identify_using)
            (df,
             new_lines_identified,
             identify_using,
             count,
             line_nos_identified_iteration,
             pos_decreased) = identify_using_pnbl_nnbl(df, identify_using, iteration)
            print("New lines identified in Iteration", iteration, ": ", count)
            iteration += 1
            count_total += count
            line_nos_identified.append(line_nos_identified_iteration)
            print("lines identified in iteration", line_nos_identified)
            print(df['Identification_Status'].value_counts())

        print("Total new lines identified in pnbl nnbl after all iteration:", count_total)
        print("line nos identified in all iterations", line_nos_identified)

        # ---- pass 2: neighbour-eligibility filter ----
        iteration = 1
        count_total = 0
        total_pos_after = 0
        new_lines_identified = True
        pos_decreased = True
        identify_using = 'Identification_Status'

        # NOTE(review): both bookkeeping columns are created together the
        # first time through - confirm the count column is not meant to be
        # reset on every round.
        if 'Identification_Status_ineligible_removed' not in df.columns:
            df['Identification_Status_ineligible_removed'] = ''
            df['CountofPossibilities_afterIneligibleRemoved'] = ''

        total_pos_start = count_possibilities(df)

        while new_lines_identified or pos_decreased:
            print("\n Identifying lines using eliminating ineligible possibilities ")
            print("using:", identify_using)
            (df,
             new_lines_identified,
             pos_decreased,
             count,
             total_pos_after) = remove_ineligible_pos(df, identify_using, iteration)
            print("New lines identified in Iteration", iteration, ": ", count)
            # From the second call on, work on the filter's own output column.
            identify_using = 'Identification_Status_ineligible_removed'
            iteration += 1
            count_total += count

        print("Total new lines identified by eliminating ineligible possibilities after all iteration:", count_total)

        df['Identification_Status'] = df['Identification_Status_ineligible_removed']
        print(df['Identification_Status'].value_counts())
        print(total_pos_start, total_pos_after, iteration)

        # Go round again only if the filter actually removed something.
        run_again = total_pos_start > total_pos_after

    print(total_pos_initial, total_pos_after)
    return df
def examine_same_content_lines(df):
    """Use repeated line texts to settle lines that may be a speaker (ps7).

    A text counts as repeated when, after stripping, it occurs on more than
    one row whose ``isIdentified`` is ``'No'``.  An unidentified row that
    carries such a text, has ``ps7`` among its possibilities and whose
    previous non-blank row is identified as ``ps15`` or ``ps6`` is

    * set to ``ps7`` and marked identified when its ``parenthetical`` value
      is ``'Absent'``;
    * otherwise narrowed to ``ps8;ps25`` (and left unidentified).

    The previous non-blank row is taken to be ``index - 2`` when ``plb`` is
    ``'Y'`` and ``index - 1`` otherwise.

    Args:
        df: line table, modified in place.

    Returns:
        The same table.
    """
    unidentified = df['isIdentified'] == 'No'
    # Count on a derived Series so no column is added to a slice of df.
    text_counts = df.loc[unidentified, 'data'].str.strip().value_counts()
    repeated_texts = set(text_counts[text_counts > 1].index)
    if not repeated_texts:
        return df

    for index in df.index:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        if 'ps7' not in df.at[index, 'Identification_Status'].split(";"):
            continue
        if df.at[index, 'data'].strip() not in repeated_texts:
            continue

        pnbl = index - 2 if df.at[index, 'plb'] == 'Y' else index - 1
        if pnbl < 0:
            # Start of script: there is no previous non-blank line to check.
            continue

        # Was `== ('ps15' or 'ps6')`, which only ever compared with 'ps15'.
        if df.at[pnbl, 'Identification_Status'] not in ('ps15', 'ps6'):
            continue

        print(index)
        try:
            print(df.at[pnbl, 'data'])
        except Exception:
            # Best-effort trace output only.
            pass
        print(df.at[pnbl, 'Identification_Status'])

        if df.at[index, 'parenthetical'] == 'Absent':
            # These two status updates were written with `==` (a comparison
            # whose result was discarded) and therefore never took effect.
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'isIdentified'] = 'Yes'
            df.at[index, 'When_Identified'] = 'ExaminingSameContentLines'
        else:
            df.at[index, 'Identification_Status'] = 'ps8;ps25'
    return df
def examine_action_possibilities_part1(df):
    """Set a line to ps6 when it closes a ps4/ps5 run just before a ps7 line.

    Rows ``df.index[2:-2]`` whose ``isIdentified`` is not ``'Yes'`` are
    examined.  A row becomes ``ps6`` when all of the following hold:

    * its indent (``ssc``) lies between 15 and 25 inclusive;
    * the previous non-blank row has exactly one possibility, ``ps4`` or
      ``ps5``, and the same indent;
    * the first possibility of the next non-blank row is ``ps7``.

    The neighbouring non-blank row is one step away when ``plb`` / ``nlb`` is
    ``'N'`` and two steps away otherwise.

    Args:
        df: line table, modified in place.

    Returns:
        The same table.
    """
    for index in df.index[2:-2]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        print("unidentified line index is", index)

        cur_line_indent = df.at[index, 'ssc']
        nnbl = index + 1 if df.at[index, 'nlb'] == 'N' else index + 2
        pnbl = index - 1 if df.at[index, 'plb'] == 'N' else index - 2

        next_nbl_line_pos = df.at[nnbl, 'Identification_Status'].split(";")
        prev_nbl_line_pos = df.at[pnbl, 'Identification_Status'].split(";")
        prev_nbl_line_indent = df.at[pnbl, 'ssc']

        if not 15 <= cur_line_indent <= 25:
            continue

        closes_action_run = (
            len(prev_nbl_line_pos) == 1
            and prev_nbl_line_pos[0] in ('ps4', 'ps5')
            and cur_line_indent == prev_nbl_line_indent
            and next_nbl_line_pos[0] == 'ps7'
        )
        if not closes_action_run:
            continue

        # .at writes straight into df; chained `df[col][i] = v` may update a
        # temporary copy instead.
        # NOTE(review): 'isIdentified' is left untouched here, unlike in
        # examine_action_possibilities_part2 - confirm that is intended.
        df.at[index, 'Identification_Status'] = 'ps6'
        df.at[index, 'When_Identified'] = 'ExaminingActionPossibilities'
        try:
            print(df.at[index, 'data'])
        except Exception:
            # Best-effort trace output only.
            pass
        print('ps6')
        print("\n")
    return df
def examine_action_possibilities_part2(df):
    """Settle ps6 and ps5 lines from fully identified neighbours.

    Two passes are made over rows ``df.index[2:-2]``; the neighbouring
    non-blank row is one step away when ``plb`` / ``nlb`` is ``'N'`` and two
    steps away otherwise.

    Pass 1 - rows whose ``isIdentified`` is not ``'Yes'`` and whose first
    possibility is not ``ps1`` become ``ps6`` when the next non-blank row is
    exactly ``ps7`` and the previous one is exactly ``ps1``, ``ps3``,
    ``ps6`` or ``ps15``.

    Pass 2 - rows that still hold several possibilities become ``ps5`` when
    the previous non-blank row is exactly ``ps4`` or ``ps5`` and the next one
    is exactly ``ps5`` or ``ps6``.

    Rows settled by either pass are marked identified.

    Args:
        df: line table, modified in place.

    Returns:
        The same table.
    """
    status_col = 'Identification_Status'
    stage = 'ExaminingActionPossibilitiesAfterIneligible'

    def neighbour_statuses(index):
        pnbl = index - 1 if df.at[index, 'plb'] == 'N' else index - 2
        nnbl = index + 1 if df.at[index, 'nlb'] == 'N' else index + 2
        return df.at[pnbl, status_col], df.at[nnbl, status_col]

    def settle(index, code):
        df.at[index, status_col] = code
        df.at[index, 'When_Identified'] = stage
        df.at[index, 'isIdentified'] = 'Yes'

    # Pass 1: ps6 just before a speaker line.
    for index in df.index[2:-2]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        if df.at[index, status_col].split(";")[0] == 'ps1':
            continue
        prev_status, next_status = neighbour_statuses(index)
        # Comparing the raw strings is the same as requiring a single
        # possibility equal to the given code.
        if next_status == 'ps7' and prev_status in ('ps1', 'ps3', 'ps6', 'ps15'):
            print("Identifying line as ps6 as before speaker and after 1,3,6, 15",)
            try:
                print(df.at[index, 'line_no'], df.at[index, 'data'])
            except Exception:
                # Best-effort trace output only.
                pass
            settle(index, 'ps6')

    # Pass 2: ps5 between ps4/ps5 and ps5/ps6.
    for index in df.index[2:-2]:
        status = df.at[index, status_col]
        if status == 'blank' or len(status.split(";")) == 1:
            continue
        prev_status, next_status = neighbour_statuses(index)
        # The old test compared the *list* of possibilities with a tuple of
        # strings, which can never match, so this rule was dead code.
        if prev_status in ('ps4', 'ps5') and next_status in ('ps5', 'ps6'):
            print("Identifying line as ps5 as between 4,5 and 5,6")
            settle(index, 'ps5')
    return df
def examine_same_indent_bunch(df):
    """Resolve runs of same-indent lines that sit between ps15 and ps7.

    Starting from each row that still has several possibilities, following
    non-blank rows are gathered into a "bunch" while they share the starting
    row's indent (``ssc``); gathering does not start when the first following
    non-blank row is already settled.  When the non-blank row before the
    bunch is exactly ``ps15`` and the one after it is exactly ``ps7``:

    * CASE A2 - a bunch of one line (whose first possibility is not ``ps1``)
      is set to ``ps6``;
    * CASE A1 - in a longer bunch whose last line is unsettled and has no
      ``ps15`` possibility, the last line is set to ``ps6`` and every other
      line keeps only the possibilities in
      ``ps1, ps2, ps3, ps4, ps5, ps6, ps16, ps18, ps19``.

    Rows that were gathered into a bunch are not used as starting rows again.
    A neighbouring non-blank row is one step away when the ``plb`` / ``nlb``
    flag is not ``'Y'`` and two steps away otherwise.

    Args:
        df: line table, modified in place.  Index labels are used in integer
            arithmetic, so they are expected to be consecutive integers.

    Returns:
        The same table.
    """
    status_col = 'Identification_Status'
    stage = 'ExaminingSameIndentBunch'
    ps_not_to_remove = ['ps1', 'ps2', 'ps3', 'ps4', 'ps5', 'ps6', 'ps16', 'ps18', 'ps19']

    def possibilities(label):
        return df.at[label, status_col].split(";")

    def show_data(label):
        try:
            print(df.at[label, 'data'])
        except Exception:
            # Best-effort trace output only.
            pass

    def mark_as_ps6(label):
        # .at writes straight into df; chained `df[col][i] = v` may update a
        # temporary copy instead.
        df.at[label, status_col] = 'ps6'
        df.at[label, 'isIdentified'] = 'Yes'
        df.at[label, 'When_Identified'] = stage
        print("ps6", df.at[label, 'data'])

    total_pos_before = sum(len(possibilities(label)) for label in df.index)

    index_iter = iter(df.index)
    for index in index_iter:
        line_pos = possibilities(index)
        if len(line_pos) == 1:
            continue
        print(index)

        cur_indent = df.at[index, 'ssc']
        if index + 2 > df.index[-1]:
            break

        # First non-blank row after the starting row.
        j = index + 2 if df.at[index, 'nlb'] == 'Y' else index + 1
        next_nbl_indent = df.at[j, 'ssc']
        nbl_identified = len(possibilities(j)) == 1

        start_index = index
        rev_index = index
        nbl_lines_count = 1
        bunch_index = [start_index]
        print("lines with same indent")
        show_data(index)

        # Gather the rows that share the starting indent.
        # NOTE(review): nbl_identified is evaluated once, for the first
        # following row only - confirm later rows need no such check.
        while cur_indent == next_nbl_indent and not nbl_identified:
            nbl_lines_count += 1
            show_data(j)
            bunch_index.append(j)
            rev_index = j
            if j + 2 >= df.index[-1]:
                break
            j = j + 2 if df.at[j, 'nlb'] == 'Y' else j + 1
            next_nbl_indent = df.at[j, 'ssc']
        print(nbl_lines_count)

        # Non-blank row before the bunch.
        pnbl = start_index - 2 if df.at[index, 'plb'] == 'Y' else start_index - 1
        if pnbl < 0:
            print("start of script \n")
            continue
        print("preceeded by", df.at[pnbl, status_col])
        show_data(pnbl)

        # Non-blank row after the bunch.
        nnbl = rev_index + 2 if df.at[rev_index, 'nlb'] == 'Y' else rev_index + 1
        if nnbl > df.index[-1]:
            print("end of script \n")
            continue
        show_data(nnbl)
        print("followed by", df.at[nnbl, status_col])
        print("\n")

        if df.at[pnbl, status_col] == 'ps15' and df.at[nnbl, status_col] == 'ps7':
            last_line_pos = possibilities(rev_index)
            if nbl_lines_count == 1 and len(last_line_pos) > 1:
                if line_pos[0] == 'ps1':
                    continue
                print("CASE A2")
                mark_as_ps6(rev_index)
                bunch_index.remove(rev_index)
            elif nbl_lines_count > 1 and len(last_line_pos) > 1 and 'ps15' not in last_line_pos:
                print("CASE A1")
                mark_as_ps6(rev_index)
                bunch_index.remove(rev_index)
                # NOTE(review): the filter below is assumed to belong to
                # CASE A1 only - confirm.  It can also leave a line with an
                # empty status when none of its possibilities is in the list.
                for k in bunch_index:
                    cur_line_pos = possibilities(k)
                    print(cur_line_pos)
                    new_line_pos = [pos for pos in cur_line_pos if pos in ps_not_to_remove]
                    df.at[k, status_col] = ";".join(new_line_pos)
                    df.at[k, 'When_Identified'] = stage
                    print(df.at[k, status_col], df.at[k, 'data'])

        # Skip the rows that were consumed by this bunch.
        for _ in range(start_index, rev_index):
            next(index_iter)

    total_pos_after = sum(len(possibilities(label)) for label in df.index)
    print(total_pos_before, total_pos_after)
    return df
def examine_relative_indent(df):
    """Identify a speaker line (ps7) from strictly decreasing indents.

    For every row that still has several possibilities, the next two
    non-blank rows are looked up.  The row is set to ``ps7`` and marked
    identified when

    * its indent (``ssc``) is greater than that of the next non-blank row,
      which in turn is greater than that of the one after it;
    * ``parenthetical`` is ``'Absent'`` on all three rows;
    * ``ps7`` is one of the row's possibilities but not one of the next
      non-blank row's.

    Rows without a previous non-blank row, or without two following non-blank
    rows inside the table, are skipped.  A neighbouring non-blank row is one
    step away when the ``plb`` / ``nlb`` flag is not ``'Y'`` and two steps
    away otherwise.

    Args:
        df: line table, modified in place.  Index labels are used in integer
            arithmetic, so they are expected to be consecutive integers.

    Returns:
        The same table.
    """
    status_col = 'Identification_Status'

    def possibilities(label):
        return df.at[label, status_col].split(";")

    def step(label, blank_flag_column):
        return 2 if df.at[label, blank_flag_column] == 'Y' else 1

    def no_parenthetical(label):
        return df.at[label, 'parenthetical'] == 'Absent'

    total_pos_before = sum(len(possibilities(label)) for label in df.index)
    last_label = df.index[-1] if len(df.index) else None

    for index in df.index:
        line_pos = possibilities(index)
        if len(line_pos) == 1:
            continue

        # Lines at the very start of the script are not examined.
        if index - step(index, 'plb') < 0:
            continue

        nnbl = index + step(index, 'nlb')
        if nnbl > last_label:
            continue
        nnnbl = nnbl + step(nnbl, 'nlb')
        if nnnbl > last_label:
            continue

        try:
            indents_decrease = df.at[index, 'ssc'] > df.at[nnbl, 'ssc'] > df.at[nnnbl, 'ssc']
        except TypeError:
            # Indents that cannot be compared: leave the line alone.
            continue
        if not indents_decrease:
            continue
        if not (no_parenthetical(index) and no_parenthetical(nnbl) and no_parenthetical(nnnbl)):
            continue

        try:
            print(df.at[index, 'data'])
        except Exception:
            # Best-effort trace output only.
            pass
        print("current possibility", line_pos)

        if 'ps7' in line_pos and 'ps7' not in possibilities(nnbl):
            print("Identifying as ps7")
            # .at writes straight into df; chained `df[col][i] = v` may
            # update a temporary copy instead.
            df.at[index, status_col] = 'ps7'
            df.at[index, 'When_Identified'] = 'ExaminingRelativeIndent'
            df.at[index, 'isIdentified'] = 'Yes'
            print("\n")

    total_pos_after = sum(len(possibilities(label)) for label in df.index)
    print(total_pos_before, total_pos_after)
    return df
def examine_pos_sp_indent(df, csv_removed_space_between_words,
                          csv_pnnbl_ineligble_after_relative_indent, margin=3):
    """Identify speaker lines (ps7) by the indent that settled speakers use.

    The indents (``ssc``) stored in ``csv_removed_space_between_words`` are
    joined on ``line_no`` with the statuses stored in
    ``csv_pnnbl_ineligble_after_relative_indent``.  Among the rows whose
    status is exactly ``ps7`` the five most frequent indents are taken and
    the largest of them becomes the reference indent (200 when there is no
    such row).  An unidentified row of ``df`` is then set to ``ps7`` and
    marked identified when

    * its indent is within ``margin`` of the reference indent,
    * ``ps7`` is one of its possibilities,
    * its ``parenthetical`` value is ``'Absent'``, and
    * its text has at most two words.

    Args:
        df: line table, modified in place.
        csv_removed_space_between_words: CSV file with ``line_no`` and
            ``ssc`` columns.
        csv_pnnbl_ineligble_after_relative_indent: CSV file with ``line_no``,
            ``Identification_Status`` and ``isIdentified`` columns.
        margin: accepted distance from the reference indent (default 3).

    Returns:
        The same table.
    """
    df_indents = pd.read_csv(csv_removed_space_between_words,
                             usecols=['line_no', 'ssc'])
    identification_status = pd.read_csv(
        csv_pnnbl_ineligble_after_relative_indent,
        usecols=['line_no', 'Identification_Status', 'isIdentified'])
    identification_status['line_no'] = identification_status['line_no'].astype(float)
    df_indents = df_indents.merge(identification_status, how='inner', on='line_no')

    is_speaker = df_indents['Identification_Status'] == 'ps7'
    sp_indents = (df_indents.loc[is_speaker, 'ssc']
                  .value_counts()
                  .sort_values(ascending=False)
                  .head(5))
    sp_indents_list = sp_indents.index.values.tolist()
    # Largest of the most frequent speaker indents; 200 when none is known.
    pos_sp_indent = max(sp_indents_list) if sp_indents_list else 200

    for index in df.index:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        cur_indent = df.at[index, 'ssc']
        if not pos_sp_indent - margin <= cur_indent <= pos_sp_indent + margin:
            continue

        data = df.at[index, 'data']
        word_count = len(data.split())
        if ('ps7' in df.at[index, 'Identification_Status'].split(";")
                and df.at[index, 'parenthetical'] == 'Absent'
                and word_count <= 2):
            try:
                print(index, data)
            except Exception:
                # Best-effort trace output only.
                pass
            print("Identifying as speaker")
            # .at writes straight into df; chained `df[col][i] = v` may
            # update a temporary copy instead.
            df.at[index, 'Identification_Status'] = 'ps7'
            df.at[index, 'isIdentified'] = 'Yes'
            df.at[index, 'When_Identified'] = 'ExaminingPossibleSpeakerIndent'
    return df
def examine_action_middle_possibilities_using_pnnbl_top(df):
    """Return ``df`` unchanged; the ps5 "top possibility" rule is disabled.

    The rule this step used to apply is switched off.  It set an unidentified
    line to ``ps5`` when its first possibility was ``ps5``, the first
    possibility of the previous non-blank line was ``ps4`` or ``ps5`` and the
    first possibility of the next non-blank line was ``ps5`` or ``ps6``.
    Lines carrying a parenthetical were skipped, as were lines for which
    ``ps16`` was the first possibility of the line before the previous
    non-blank line, or the second possibility of either of those two lines.

    What remained was a loop that only looked up those neighbouring values
    and then discarded them, so it has been removed.  The function is kept so
    that existing callers keep working.

    Args:
        df: line table.

    Returns:
        The same table, untouched.
    """
    return df
df['Identification_Status'][pnbl_index-2].split(";") except: pass line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = df['Identification_Status'][index].split(";") ## from here # ## declare ps5 if prev ps4,ps5 and next ps5,ps6 # if cur_line_pos[0] == 'ps5' and pnbl_pos[0] in ('ps4','ps5') and nnbl_pos[0] in ('ps5','ps6'): # ## if cur parenthtical or # if df['Parenthetical'][index] != 'Absent': # print("skipping as current has parenthetical ") # print(line_no,data) # continue # if ppnbl_pos[0] == 'ps16' : # print(" skipping as pre previous top transition") # print(line_no,data) # continue # try: # if ppnbl_pos[1] == 'ps16' or pnbl_pos[1] == 'ps16': # print("skipping as previous or pre previous top2 transition") # print(line_no,data) # continue # except: # pass # print("Identifying line as ps5 as between 4,5 and 5,6") # print(data) # df['Identification_Status'][index] = 'ps5' # df['When_Identified'][index] = 'ExaminingActionMiddlePossibilitiesUsingTopPnnbl' # df['Identified'] = 'identified' # continue ## till here return df def examine_speaker_extension(df,audit_df): # loop through to examine speaker extensiton top 'ps8' for index in df.index[2:-2]: if df['isIdentified'][index] == 'Yes': continue # if df['prvious_line_blank'][index] == 'N' : # pnbl_pos = df['Identification_Status'][index-1].split(";") # pnbl_index = index -1 # else: # pnbl_pos = df['Identification_Status'][index-2].split(";") # pnbl_index = index -2 if df['nlb'][index] == 'N' : nnbl_pos = df['Identification_Status'][index+1].split(";") nnbl_par = df['parenthetical'][index+1] else: nnbl_pos = df['Identification_Status'][index+2].split(";") nnbl_par = df['parenthetical'][index+2] # try: # if df['prvious_line_blank'][pnbl_index] == 'N' : # ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";") # else: # ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";") # except: # pass line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = 
df['Identification_Status'][index].split(";") cur_line_par = df['parenthetical'][index] extn_found = False extn_list = ['O.S.','V.O.',"CONT'D","CONTā€™D",'VOICE'] for extn in extn_list: if extn in str(data): extn_found = True break ## if hishest is ps8 if cur_line_pos[0] == 'ps8' and cur_line_par == 'PartMidEnd' and nnbl_par == 'Absent' and not extn_found: try: print(data) except: pass if re.search('\(',data,re.IGNORECASE) : pos_starts = re.search('\(',data,re.IGNORECASE).start() #pos_end = re.search('(',data,re.IGNORECASE).end() before_par = data[:pos_starts] after_par = data[pos_starts:] print ("Separating Parenthetical") print("Identifying as speaker") print(index) try: print(before_par) except: pass df['data'][index] = before_par df['parenthetical'][index] = 'Absent' df['When_Identified'][index] = 'ExaminingSpeakerLines' df['case'][index] = 'AllUpper' df['Identification_Status'][index] = 'ps7' nlb = df['nlb'][index] df['nlb'][index] = 'N' line_no = df['line_no'][index] next_line_no = df['line_no'][index+1] new_line_no = (line_no + next_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + next_line_no)/2 audit_df.loc[new_line_no] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Speaker and Parenthetical' print(df['Identification_Status'][index]) try: print(after_par) except: pass print("identifying parenthetical") df.loc[index + 0.25] = '' df.loc[index + 0.25,'data'] = after_par df.loc[index + 0.25,'parenthetical'] = 'Complete' df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines' df.loc[index + 0.25,'Identification_Status'] = 'ps10' df.loc[index + 0.25,'case'] = '' df.loc[index + 0.25,'plb'] = 'N' df.loc[index + 0.25,'nlb'] = nlb df.loc[index + 0.25,'line_no'] = new_line_no df = df.sort_index().reset_index(drop=True) continue ##now examine the speakers having : or apstrophe after them and separate to new line speaker_list = 
df.loc[df['Identification_Status'] == 'ps7','data'].astype(str) speaker_list = [ elem.strip() for elem in speaker_list ] speaker_lines_list = df.loc[df['Identification_Status'] == 'ps7','line_no'].to_list() unique_speaker_list = [] speaker_in_two_lines_list = [] for speaker in speaker_list: speaker = speaker.strip() #print(speaker) if speaker not in unique_speaker_list: unique_speaker_list.append(speaker) ## strip the blank spaces print(unique_speaker_list) for index in df.index[2:-2]: if df['isIdentified'][index] == 'Yes': continue line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = df['Identification_Status'][index].split(";") extn_found = False extn_list = ['O.S.','V.O.',"CONT'D","CONTā€™D",'VOICE'] for extn in extn_list: if extn in str(data): extn_found = True break for speaker in unique_speaker_list: if re.search(speaker,data) and df['Identification_Status'][index] not in ('ps7','ps8','ps9') : # check if speaker is at start of line followed by something (like : apostrpohe) print(index) pos_starts = re.search(speaker,data,re.IGNORECASE).start() pos_end = re.search(speaker,data,re.IGNORECASE).end() before_speaker = data[:pos_starts] after_speaker = data[pos_end:] print("speaker match found") try: print("data 4567:", data) print("speaker 4568:",speaker) print("before speaker:",before_speaker) print("after speaker:",after_speaker) except: pass try: char1_after_speaker = after_speaker.lstrip()[0] except: char1_after_speaker = '' try: print("char1_after_speaker 4579 :",char1_after_speaker) except: pass speaker_skip_list = ['MONTAGES','MUSICAL MONTAGES','MORNING','AT HOTEL','TV','ESSENTIALS','ESSENTIAL','LATER'] ## separate parenthtical if speaker is followed by parenthtical if before_speaker.isspace() and char1_after_speaker == '(' and df['parenthetical'][index] == 'PartMidEnd' and not extn_found: print("before speaker inside the if condition:",before_speaker) print ("Seperating Parenthetical") print("Identifying speaker") print(index) 
df['data'][index] = before_speaker + speaker df['parenthetical'][index] = 'Absent' df['When_Identified'][index] = 'ExaminingSpeakerLines' df['case'][index] = 'AllUpper' df['Identification_Status'][index] = 'ps7' nlb = df['nlb'][index] df['nlb'][index] = 'N' line_no = df['line_no'][index] next_line_no = df['line_no'][index+1] new_line_no = (line_no + next_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + next_line_no)/2 audit_df.loc[new_line_no] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Speaker and Parenthetical' #print(df['Identification_Status'][index]) print("identifying parenthetical") df.loc[index + 0.25] = '' df.loc[index + 0.25,'data'] = after_speaker df.loc[index + 0.25,'parenthetical'] = 'Complete' df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines' df.loc[index + 0.25,'Identification_Status'] = 'ps10' df.loc[index + 0.25,'case'] = '' df.loc[index + 0.25,'plb'] = 'N' df.loc[index + 0.25,'nlb'] = nlb df.loc[index + 0.25,'line_no'] = new_line_no df = df.sort_index().reset_index(drop=True) continue elif before_speaker.isspace() and char1_after_speaker == ':' and not extn_found and speaker not in speaker_skip_list: print("before speaker in elif condition 4624:", before_speaker) print ("Seperating : colon dialogue") print("Identifying speaker") print(index) df['data'][index] = before_speaker + speaker df['parenthetical'][index] = 'Absent' df['When_Identified'][index] = 'ExaminingSpeakerLines' df['case'][index] = 'AllUpper' df['Identification_Status'][index] = 'ps7' nlb = df['nlb'][index] df['nlb'][index] = 'N' #print(df['Identification_Status'][index]) line_no = df['line_no'][index] next_line_no = df['line_no'][index+1] new_line_no = (line_no + next_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + next_line_no)/2 audit_df.loc[new_line_no] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' 
audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Speaker and Dialogue seperated by colon:' print("possible dialogue") print(after_speaker) df.loc[index + 0.25] = '' df.loc[index + 0.25,'data'] = after_speaker #df.loc[index + 0.25,'Parenthetical'] = 'Complete' df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines' df.loc[index + 0.25,'Identification_Status'] = ";".join(cur_line_pos) df.loc[index + 0.25,'case'] = '' df.loc[index + 0.25,'plb'] = 'N' df.loc[index + 0.25,'nlb'] = nlb df.loc[index + 0.25,'line_no'] = new_line_no df = df.sort_index().reset_index(drop=True) continue elif before_speaker.isspace() and (char1_after_speaker == 'ā€˜' or char1_after_speaker == '"') and not extn_found: print("before speaker in seperating apostrophe:", before_speaker) print ("Seperating apostrophe") print("Identifying speaker") print(index) df['data'][index] = before_speaker + speaker df['parenthetical'][index] = 'Absent' df['When_Identified'][index] = 'ExaminingSpeakerLines' df['case'][index] = 'AllUpper' df['Identification_Status'][index] = 'ps7' nlb = df['nlb'][index] df['nlb'][index] = 'N' line_no = df['line_no'][index] next_line_no = df['line_no'][index+1] new_line_no = (line_no + next_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + next_line_no)/2 audit_df.loc[new_line_no] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' audit_df.loc[line_no]['line_broken_into_multiple_lines'] = 'Separated Speaker and Dialogue seperated by colon:' #print(df['Identification_Status'][index]) print("identifying as parenthetical") df.loc[index + 0.25] = '' df.loc[index + 0.25,'data'] = '(' + after_speaker.strip() + ')' df.loc[index + 0.25,'parenthetical'] = 'Complete' df.loc[index + 0.25,'When_Identified'] ='ExaminingSpeakerLines' df.loc[index + 0.25,'Identification_Status'] = 'ps10' df.loc[index + 0.25,'case'] = '' df.loc[index + 0.25,'plb'] = 'N' df.loc[index + 0.25,'nlb'] = nlb df.loc[index + 0.25,'line_no'] = new_line_no df = 
df.sort_index().reset_index(drop=True) continue # df.to_csv(p.output_file_path,index=False) # lines_not_removed = audit_df.loc[audit_df['line_removed'] != 'Yes'].index.to_list() # audit_df.sort_index(inplace= True) # audit_df.reset_index(inplace= True) # for line in lines_not_removed: # new_data = '' # try: # new_data =df.loc[df['line_no'] == line, 'data'].values[0] # except: # pass # #print(new_data) # audit_df.loc[audit_df['line_no'] == line, 'data_corrected'] = new_data # #print(audit_df.loc[audit_df['line_no'] == line, 'data_corrected']) return df def examine_action_using_top2_part1(df): # loop through for index in df.index[2:-2]: if df['isIdentified'][index] == 'Yes': continue cur_indent = df['ssc'][index] nnbl_indent = 0 if df['plb'][index] == 'N' : pnbl_pos = df['Identification_Status'][index-1].split(";") pnbl_index = index -1 else: pnbl_pos = df['Identification_Status'][index-2].split(";") pnbl_index = index -2 if df['nlb'][index] == 'N' : nnbl_pos = df['Identification_Status'][index+1].split(";") nnbl_par = df['parenthetical'][index+1] nnbl_indent = df['ssc'][index+1] else: nnbl_pos = df['Identification_Status'][index+2].split(";") nnbl_par = df['parenthetical'][index+2] nnbl_indent = df['ssc'][index+2] # try: # if df['prvious_line_blank'][pnbl_index] == 'N' : # ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";") # else: # ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";") # except: # pass line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = df['Identification_Status'][index].split(";") ## skip if next is dialogue if ("".join(nnbl_pos) == 'ps13') or ("".join(nnbl_pos) == 'ps15'): continue try: pnbl_top2 = pnbl_pos[1] except: pnbl_top2 = '' try: nnbl_top2 = nnbl_pos[1] except: nnbl_top2 = '' ## ps4 identification made stricter if pnbl top is ps8 try: if pnbl_pos[0] == 'ps8': continue except: pass ## examine ps4 = action beginning line_identified = False if cur_line_pos[0] == 'ps4' and pnbl_pos[0] != 'ps4' and 
nnbl_pos[0] != 'ps7': try: print(data) except: pass print(pnbl_pos[0],cur_line_pos[0],nnbl_pos[0]) if 'ps6' in (pnbl_pos[0]) and df['nlb'][index] == 'N' and ('ps7' not in cur_line_pos): ## can make strict by indent also print('identifying as ps4 case 1 top 1') df['Identification_Status'][index] = 'ps4' df['When_Identified'][index] = 'UsingTop2PNNBL' line_identified = True # elif pnbl_top2 and 'ps6' in pnbl_top2: # print('identifying as ps4 case 1 top2') # df['Identification_Status'][index] = 'ps4' # df['When_Identified'][index] = 'UsingTop2PNNBL' # line_identified = True else: print("ps6 not in previous") # if line_identified : # # run pnnbl ineligible # do_while_pnnbl_ineligible_v035.run_pnnbl_ineligible(df) if cur_line_pos[0] == 'ps4' and not line_identified and pnbl_pos[0] != 'ps4': print("checking for ps5/6 in next") if pnbl_pos[0] == 'ps5': print("skipping as previous top is ps5" ) continue elif pnbl_top2 and 'ps5' in pnbl_top2: print("skipping as previous top2 is ps5" ) continue if cur_indent == nnbl_indent: if 'ps5' in nnbl_pos[0] or 'ps6' in nnbl_pos[0]: try: print(data,'identifying as ps4 case 2 top1') except: pass df['Identification_Status'][index] = 'ps4' df['When_Identified'][index] = 'UsingTop2PNNBL' elif nnbl_top2 and 'ps5' in nnbl_top2: try: print(data,'identifying as ps4 case 2 top2 ps5') except: pass df['Identification_Status'][index] = 'ps4' df['When_Identified'][index] = 'UsingTop2PNNBL' elif nnbl_top2 and 'ps6' in nnbl_top2: try: print(data,'identifying as ps4 case 2 top2 ps6') except: pass df['Identification_Status'][index] = 'ps4' df['When_Identified'][index] = 'UsingTop2PNNBL' else: print("current indent is not equal to next indent") print("\n") ## commented as ps6 getting wrong # for index in df.index[2:-2]: # if df['isIdentified'][index] == 'Yes': # continue # if df['plb'][index] == 'N' : # pnbl_pos = df['Identification_Status'][index-1].split(";") # pnbl_index = index -1 # else: # pnbl_pos = df['Identification_Status'][index-2].split(";") 
# pnbl_index = index -2 # if df['nlb'][index] == 'N' : # nnbl_pos = df['Identification_Status'][index+1].split(";") # nnbl_par = df['parenthetical'][index+1] # else: # nnbl_pos = df['Identification_Status'][index+2].split(";") # nnbl_par = df['parenthetical'][index+2] # # try: # # if df['prvious_line_blank'][pnbl_index] == 'N' : # # ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";") # # else: # # ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";") # # except: # # pass # line_no = df['line_no'][index] # data = df['data'][index] # cur_line_pos = df['Identification_Status'][index].split(";") # try: # pnbl_top2 = pnbl_pos[1] # except: # pnbl_top2 = '' # try: # nnbl_top2 = nnbl_pos[1] # except: # nnbl_top2 = '' # ## examine action end # if cur_line_pos[0] == 'ps6': # try: # print("pnbl",df['data'][pnbl_index]) # except: # pass # #print(pnbl_pos) # if 'ps4' in pnbl_pos[0] or 'ps5' in pnbl_pos[0] : # #print(pnbl_pos[0]) # try: # print(data) # except: # pass # print("identifying as ps6 using top1 pnbl") # df['Identification_Status'][index] = 'ps6' # df['When_Identified'][index] = 'UsingTop2PNNBL' # elif pnbl_top2 and ('ps4' in pnbl_top2 or 'ps5' in pnbl_top2): # try: # print(data) # except: # pass # df['Identification_Status'][index] = 'ps6' # df['When_Identified'][index] = 'UsingTop2PNNBL' # print("identifying as ps6 using top2 pnbl") return df def refine_action_possibilties(df): for index in df.index[1:-1]: if df['isIdentified'][index] == 'Yes': continue pnbl_pos = [] nnbl_pos = [] if index == 0: pnbl_pos = ['blank'] elif df['plb'][index] == 'N' : pnbl_pos = df['Identification_Status'][index-1].split(";") pnbl_index = index -1 elif index - 1 == 0: pnpl_pos = ['blank'] else: pnbl_pos = df['Identification_Status'][index-2].split(";") pnbl_index = index -2 if index == df.index[-1]: nnbl_pos = ['blank'] elif df['nlb'][index] == 'N' : nnbl_pos = df['Identification_Status'][index+1].split(";") nnbl_par = df['parenthetical'][index+1] elif index+1 == 
df.index[-1]: nnbl_pos = ['blank'] else: nnbl_pos = df['Identification_Status'][index+2].split(";") nnbl_par = df['parenthetical'][index+2] # try: # if df['prvious_line_blank'][pnbl_index] == 'N' : # ppnbl_pos = df['Identification_Status'][pnbl_index-1].split(";") # else: # ppnbl_pos = df['Identification_Status'][pnbl_index-2].split(";") # except: # pass line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = df['Identification_Status'][index].split(";") # try: # pnbl_top2 = pnbl_pos[1] # except: # pnbl_top2 = '' # try: # nnbl_top2 = nnbl_pos[1] # except: # nnbl_top2 = '' # print(line_no,data) # print(pnbl_pos) # print(cur_line_pos) # print(nnbl_pos) line_new_pos = [] #using pnbl and nnbl identified lines refine/identify current line # if "".join(pnbl_pos) in ('ps15','ps6') and cur_line_pos[0] == 'ps4': # print(line_no,data) # print("pnbl is 15 or 6 and current top is 'ps4'") # print("Identifying as ps4") # df['Identification_Status'][index] = 'ps4' # cur_line_pos = ['ps4'] # df['When_Identified'][index] = 'RefiningActionPossibilities' line_new_pos = cur_line_pos if "".join(nnbl_pos) == 'ps7': try: print(line_no,data) except: pass print("remove ps5,14") if 'ps5' in line_new_pos: line_new_pos.remove('ps5') if 'ps14' in line_new_pos: line_new_pos.remove('ps14') if "".join(nnbl_pos) == 'ps4': try: print(line_no,data) except: pass print("remove ps5") if 'ps5' in line_new_pos: line_new_pos.remove('ps5') if "".join(pnbl_pos) == 'ps4': try: print(line_no,data) except: pass print("remove ps3 and 7") if 'ps3' in line_new_pos: line_new_pos.remove('ps3') if 'ps7' in line_new_pos: line_new_pos.remove('ps7') df['Identification_Status'][index] = ";".join(line_new_pos) return df def prep_pnnbl_eligible_csv(pnbl_eligibility_matrix,nnbl_eligibility_matrix): cur_dir = mypath # cur_dir = os.getcwd() pnbl_eligible_df = pd.read_csv(pnbl_eligibility_matrix, skiprows = [0]) nnbl_eligible_df = pd.read_csv(nnbl_eligibility_matrix, skiprows = [0]) 
pnbl_eligible_df.rename(columns={pnbl_eligible_df.columns[1]:'Possibilities',pnbl_eligible_df.columns[0]:'Description'} ,inplace = True) nnbl_eligible_df.rename(columns={nnbl_eligible_df.columns[1]:'Possibilities',nnbl_eligible_df.columns[0]:'Description'} ,inplace = True) pnbl_eligible_df.to_csv(os.path.join(cur_dir,'pnbl_eligible_pos.csv'), index =False) nnbl_eligible_df.to_csv(os.path.join(cur_dir,'nnbl_eligible_pos.csv'), index =False) pnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'pnbl_eligible_pos.csv'), index_col = ['Possibilities']) nnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'nnbl_eligible_pos.csv'), index_col = ['Possibilities']) def check_eligibility_using_identified_pnnbl(df): total_pos_before = 0 total_pos_after = 0 lines_identified = 0 cur_dir = mypath pnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'pnbl_eligible_pos.csv')) nnbl_eligible_df = pd.read_csv(os.path.join(cur_dir,'nnbl_eligible_pos.csv')) for index in df.index: if df['isIdentified'][index] == 'Yes': total_pos_before += 1 total_pos_after += 1 print(total_pos_before,total_pos_after) continue line_no = df['line_no'][index] data = df['data'][index] cur_line_pos = df['Identification_Status'][index].split(";") if cur_line_pos[0] != '': total_pos_before += len(cur_line_pos) pnbl_pos = [] nnbl_pos = [] if index == 0: pnbl_pos = ['blank'] elif df['plb'][index] == 'N' : pnbl_pos = df['Identification_Status'][index-1].split(";") pnbl_index = index -1 elif index - 1 == 0: pnpl_pos = ['blank'] else: pnbl_pos = df['Identification_Status'][index-2].split(";") pnbl_index = index -2 if index == df.index[-1]: nnbl_pos = ['blank'] elif df['nlb'][index] == 'N' : nnbl_pos = df['Identification_Status'][index+1].split(";") nnbl_par = df['parenthetical'][index+1] elif index+1 == df.index[-1]: nnbl_pos = ['blank'] else: nnbl_pos = df['Identification_Status'][index+2].split(";") nnbl_par = df['parenthetical'][index+2] line_new_pos = cur_line_pos try: print(line_no,data) except: pass 
print("current line pos", cur_line_pos,df['Identification_Status'][index]) try: print("previous line pos",pnbl_pos) print("next line pos",nnbl_pos) except: pass if len(pnbl_pos) == 1 and pnbl_pos[0] != 'blank': print("pnbl is identified as ", pnbl_pos) ## keep only possibilities which can exist with this pnbl ## filter pnbl_eligible_pos = pnbl_eligible_df.loc[pnbl_eligible_df[pnbl_pos[0]] == 'yes','Possibilities'].to_list() print("eligible possibilties as per pnbl",pnbl_eligible_pos) line_new_pos = [ps for ps in line_new_pos if ps in pnbl_eligible_pos] print("line new possibilities", line_new_pos) else: print("previous line not identified") if len(nnbl_pos) == 1 and nnbl_pos[0] != 'blank': print("nnbl is identified as ", nnbl_pos) ## keep only possibilities which can exist with this pnbl ## filter nnbl_eligible_pos = nnbl_eligible_df.loc[nnbl_eligible_df[nnbl_pos[0]] == 'yes','Possibilities'].to_list() print("eligible possibilties as per nnbl",nnbl_eligible_pos) line_new_pos = [ps for ps in line_new_pos if ps in nnbl_eligible_pos] print("line new possibilities", line_new_pos) else: print("next line not identified") ## make null as special term if len(line_new_pos) == 0: print("making null possibility special term ps17") line_new_pos = ['ps17'] if len(line_new_pos) == 1: df['isIdentified'][index] = 'Yes' lines_identified += 1 df['Identification_Status'][index] = (";").join(line_new_pos) total_pos_after += len(line_new_pos) print(total_pos_before,total_pos_after) print(total_pos_before,total_pos_after) pos_decreased = True if total_pos_after < total_pos_before else False return df,pos_decreased,lines_identified def do_while_examine_using_identified_pnnbl(df): pos_decreased = True total_lines_identified = 0 iteration = 0 while pos_decreased : iteration += 1 df,pos_decreased,lines_identified = check_eligibility_using_identified_pnnbl(df) total_lines_identified += lines_identified print(iteration,total_lines_identified) print(iteration,total_lines_identified) return df 
def _print_safely(*args):
    """Print ``args``, ignoring consoles that cannot encode the text."""
    try:
        print(*args)
    except UnicodeEncodeError:
        pass


def _mentions_extension(data, extra_markers=()):
    """Return True when ``data`` contains a speaker-extension marker."""
    markers = (
        'O.S.', 'V.O.', "CONT'D",
        'CONT\u2019D',                 # CONT'D typed with a curly apostrophe
        'CONT\u0101\u20ac\u2122D',     # mis-decoded spelling this file used before
        'VOICE',
    ) + tuple(extra_markers)
    text = str(data)
    return any(marker in text for marker in markers)


def _adjacent_non_blank(df, index):
    """Locate the previous / next non-blank line of row ``index``.

    Returns ``(pnbl_pos, pnbl_index, nnbl_pos, nnbl_index)``.  The ``*_pos``
    values are the ';'-split ``Identification_Status`` of the neighbour (one
    row away when ``plb``/``nlb`` is 'N', otherwise two) or ``['blank']``
    when it would fall outside the frame; in that case the matching index
    stays at ``index - 1`` / ``index + 1``.  A default 0..n-1 index is
    expected.
    """
    last = df.index[-1]
    pnbl_index = index - 1
    nnbl_index = index + 1

    if index == 0:
        pnbl_pos = ['blank']
    elif df.at[index, 'plb'] == 'N':
        pnbl_pos = df.at[index - 1, 'Identification_Status'].split(";")
    elif index - 1 == 0:
        # This branch used to assign to a misspelt name (``pnpl_pos``), which
        # left the list empty and made ``pnbl_pos[0]`` raise IndexError.
        pnbl_pos = ['blank']
    else:
        pnbl_pos = df.at[index - 2, 'Identification_Status'].split(";")
        pnbl_index = index - 2

    if index == last:
        nnbl_pos = ['blank']
    elif df.at[index, 'nlb'] == 'N':
        nnbl_pos = df.at[index + 1, 'Identification_Status'].split(";")
    elif index + 1 == last:
        nnbl_pos = ['blank']
    else:
        nnbl_pos = df.at[index + 2, 'Identification_Status'].split(";")
        nnbl_index = index + 2

    return pnbl_pos, pnbl_index, nnbl_pos, nnbl_index


def _top_three(cur_line_pos):
    """Return the first three possibilities, padding with the first one."""
    top1 = cur_line_pos[0]
    top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
    top3 = cur_line_pos[2] if len(cur_line_pos) >= 3 else top1
    return top1, top2, top3


def _line_before_is_transition(df, pnbl_index):
    """Return True when the non-blank line before ``pnbl_index`` tops as 'ps16'.

    False when that line does not exist or has no usable status.
    """
    step = 1 if df.at[pnbl_index, 'plb'] == 'N' else 2
    ppnbl_index = pnbl_index - step
    if ppnbl_index not in df.index:
        return False
    status = df.at[ppnbl_index, 'Identification_Status']
    return isinstance(status, str) and status.split(";")[0] == 'ps16'


def start_top_identifications_part1(df):
    """Commit lines to their top possibility when the context supports it.

    For every unidentified line except the first and the last:

    * A line is narrowed when its top possibility is ps1, ps7, ps8 or ps9,
      or ps6 (previous non-blank line without parenthetical, next one not
      topping as ps5/ps6), or ps16 (next one not topping as ps13/ps15/ps10).
      It is narrowed to the top code, or to 'ps7' when the top is ps9, the
      second is ps7 and the line has no parenthetical.  'ps2' / 'ps3' /
      'ps30' are kept alongside it when they were listed and the next line
      tops as ps1 / the previous line tops as ps1 / the top is ps1.
      Exceptions: a ps1 line whose 2nd/3rd possibility is ps6 or whose 2nd is
      ps8, and a ps6 line that could equally be dialogue end or a speaker
      (see the inline checks), are left alone.
    * Otherwise a line with 'ps5' among its first two possibilities becomes
      'ps6' when the previous non-blank line tops as ps4 with the same
      indent, its second possibility is not ps16 and the next row is blank.

    Finally every unidentified line topping as 'ps8' that carries an
    extension marker is narrowed to 'ps8'.

    ``df`` is updated in place and returned.
    """
    # NOTE(review): the nesting below ``if(len(cur_line_pos) > 1):`` could not
    # be recovered from the source.  It is read as guarding only the first
    # "can also be ps15" check, so single-possibility lines are still stamped
    # with When_Identified.  Please confirm.
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        pnbl_pos, pnbl_index, nnbl_pos, _ = _adjacent_non_blank(df, index)
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        top1, top2, top3 = _top_three(cur_line_pos)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        cur_par = df.at[index, 'parenthetical']
        cur_indent = df.at[index, 'ssc']
        pnbl_indent = df.at[pnbl_index, 'ssc']
        pnbl_par = df.at[pnbl_index, 'parenthetical']
        nnbl_top = nnbl_pos[0]

        is_top_candidate = (
            top1 in ('ps1', 'ps7', 'ps8', 'ps9')
            or (top1 == 'ps6' and pnbl_par == 'Absent'
                and nnbl_top not in ('ps5', 'ps6'))
            or (top1 == 'ps16' and nnbl_top not in ('ps13', 'ps15', 'ps10'))
        )
        if is_top_candidate:
            _print_safely(line_no, data)
            if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
                continue           # too uncertain to call it ps1
            if top1 == 'ps6':
                # The fifth possibility used to be looked at only when the
                # line had exactly five of them.
                if 'ps15' in cur_line_pos[:5]:
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                # 'ps14' used to be compared with the whole list, which is
                # never equal to a string.
                if pnbl_pos[0] in ('ps13', 'ps14'):
                    print("not identifying as ps6 as could be ps15")
                    continue
                if (df.at[pnbl_index, 'case'] == 'AllUpper'
                        and len(str(df.at[pnbl_index, 'data']).split()) == 1):
                    print("not identifying as ps6 can also be ps15 ")
                    continue
                if len(data.split()) == 1 and cur_indent > pnbl_indent:
                    print("not identifying as ps6 can also be ps7 ")
                    continue
            print("identifying as top", top1)
            if top1 == 'ps9' and top2 == 'ps7' and cur_par == 'Absent':
                line_new_pos = [top2]
            else:
                line_new_pos = [top1]
            if nnbl_top == 'ps1' and 'ps2' in cur_line_pos:
                line_new_pos.append('ps2')
                print("added ps2 to ps1")
            if pnbl_pos[0] == 'ps1' and 'ps3' in cur_line_pos:
                line_new_pos.append('ps3')
                print("added ps3 to ps1")
            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")
            # Single .at writes: chained df[col][index] = ... is not
            # guaranteed to reach df (never under pandas copy-on-write).
            df.at[index, 'Identification_Status'] = ";".join(line_new_pos)
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart1'
            continue

        # cur_line_pos[:2] also covers single-possibility lines, for which
        # cur_line_pos[1] used to raise IndexError.
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''
        if 'ps5' not in cur_line_pos[:2] or 'ps16' in pnbl_top2:
            continue
        # The stricter "identify as ps5" rule is disabled in this variant;
        # only the "next row is blank" rule remains.
        if (pnbl_pos[0] == 'ps4' and df.at[index, 'nlb'] == 'Y'
                and cur_indent == pnbl_indent):
            print("identifying current as ps6 as next also blank")
            _print_safely(line_no, data)
            df.at[index, 'Identification_Status'] = 'ps6'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart1'

    # additionally identify the ps8 lines that carry an extension marker
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        top = df.at[index, 'Identification_Status'].split(";")[0]
        if top == 'ps8' and _mentions_extension(df.at[index, 'data'], ('CONT.',)):
            df.at[index, 'Identification_Status'] = 'ps8'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsPart1'
    return df


def start_top_identifications_part1_diluted(df):
    """Lenient variant of ``start_top_identifications_part1``.

    For every unidentified line except the first and the last:

    * A ps1 line whose 2nd/3rd possibility is ps6, or whose 2nd is ps8, is
      left alone.
    * A line is narrowed to its top possibility (plus 'ps30' for ps1 lines
      that list it) when the top is ps1 or ps7, or ps6 (previous non-blank
      line without parenthetical, next one neither exactly 'ps6' nor topping
      as ps5, and 'ps15' not among the first five possibilities), or ps16
      (next one not topping as ps15).
    * Otherwise a line with 'ps5' among its first two possibilities, whose
      previous non-blank line does not have ps16 as second possibility,
      becomes
        - 'ps5' when the previous line tops as ps4 (or as ps5 without a ps16
          line before it), the next row is not blank, all three indents are
          equal and none of the three lines is all upper case;
        - 'ps6' when the previous line tops as ps4 with the same indent and
          the next row is blank.

    ``df`` is updated in place and returned.
    """
    print("in tops diluted")
    for index in df.index[1:-1]:
        if df.at[index, 'isIdentified'] == 'Yes':
            continue
        pnbl_pos, pnbl_index, nnbl_pos, nnbl_index = _adjacent_non_blank(df, index)
        cur_line_pos = df.at[index, 'Identification_Status'].split(";")
        top1, top2, top3 = _top_three(cur_line_pos)
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        if top1 == 'ps1' and (top2 == 'ps6' or top3 == 'ps6' or top2 == 'ps8'):
            continue               # too uncertain to call it ps1

        pnbl_par = df.at[pnbl_index, 'parenthetical']
        nnbl_top = nnbl_pos[0]
        is_top_candidate = (
            top1 in ('ps1', 'ps7')
            or (top1 == 'ps6' and pnbl_par == 'Absent'
                and "".join(nnbl_pos) != 'ps6' and nnbl_top != 'ps5')
            or (top1 == 'ps16' and nnbl_top != 'ps15')
        )
        if is_top_candidate:
            _print_safely(line_no, data)
            if top1 == 'ps6' and 'ps15' in cur_line_pos[:5]:
                print("not identifying as ps6 can also be ps15 ")
                continue
            print("identifying as top", top1)
            line_new_pos = [top1]
            if top1 == 'ps1' and 'ps30' in cur_line_pos:
                line_new_pos.append('ps30')
                print("added ps30 to ps1")
            df.at[index, 'Identification_Status'] = ";".join(line_new_pos)
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'
            continue

        # cur_line_pos[:2] also covers single-possibility lines, for which
        # cur_line_pos[1] used to raise IndexError.
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''
        if 'ps5' not in cur_line_pos[:2] or 'ps16' in pnbl_top2:
            continue
        cur_indent = df.at[index, 'ssc']
        pnbl_indent = df.at[pnbl_index, 'ssc']
        nnbl_indent = df.at[nnbl_index, 'ssc']
        cases = (df.at[pnbl_index, 'case'], df.at[index, 'case'],
                 df.at[nnbl_index, 'case'])
        next_row_blank = df.at[index, 'nlb']
        follows_action = (
            pnbl_pos[0] == 'ps4'
            or (pnbl_pos[0] == 'ps5'
                and not _line_before_is_transition(df, pnbl_index))
        )
        if (follows_action and next_row_blank == 'N'
                and cur_indent == pnbl_indent and cur_indent == nnbl_indent
                and 'AllUpper' not in cases):
            print("Lenient: code not commented")
            print("identifying current as ps5")
            _print_safely(line_no, data)
            df.at[index, 'Identification_Status'] = 'ps5'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'
        elif (pnbl_pos[0] == 'ps4' and next_row_blank == 'Y'
                and cur_indent == pnbl_indent):
            print("identifying current as ps6 as next also blank")
            _print_safely(line_no, data)
            df.at[index, 'Identification_Status'] = 'ps6'
            df.at[index, 'When_Identified'] = 'StartIdentifyingTopsDiluted'
    return df


def examine_speaker_mix_part1(df, audit_df):
    """Split 'ps30' lines of the form ``SPEAKER (parenthetical) ...``.

    An unidentified line is cut at its first '(' when it lists 'ps30', does
    not top as ps1/ps2/ps4/ps5/ps13/ps14 (possible slugline, action or
    dialogue), has a 'PartMidEnd' parenthetical, carries no extension marker
    and the text before the bracket is upper case.  The text before the
    bracket stays on the line (possibilities unchanged); the rest becomes a
    new 'ps10' row right after it, numbered halfway to the next line.

    Args:
        df: line table; it is sorted and re-indexed 0..n-1 first.
        audit_df: audit table labelled by ``line_no``, updated in place; may
            be None.

    Returns:
        The re-indexed line table.
    """
    df = df.sort_index().reset_index(drop=True)
    # NOTE(review): audit_df used to be overwritten with a copy of df here,
    # so no audit entry ever reached the caller.  It is now updated in place,
    # like the other speaker-splitting step does - confirm this is wanted.
    slugline_tops = ('ps1', 'ps2', 'ps14', 'ps5', 'ps13', 'ps4')

    # A counter is used instead of iterating df.index: splitting inserts a
    # row, and a snapshot of the old index would miss the trailing rows.
    index = -1
    while index + 1 < len(df):
        index += 1
        status = df.at[index, 'Identification_Status']
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue
        cur_line_pos = status.split(";")
        if 'ps30' not in cur_line_pos or cur_line_pos[0] in slugline_tops:
            continue
        data = df.at[index, 'data']
        if (df.at[index, 'parenthetical'] != 'PartMidEnd'
                or _mentions_extension(data)):
            continue
        _print_safely(data)
        if '(' not in data:
            continue
        cut = data.index('(')
        before_par = data[:cut]
        after_par = data[cut:]
        if not before_par.isupper():
            continue               # probably not a speaker
        print("Seperating Parenthetical")
        print("Identifying as speaker mix with dialogue and current pos")
        print(cur_line_pos)
        print(index)

        nlb = df.at[index, 'nlb']
        df.at[index, 'data'] = before_par
        df.at[index, 'parenthetical'] = 'Absent'
        df.at[index, 'When_Identified'] = 'ExaminingSpeakerMix'
        df.at[index, 'Identification_Status'] = ";".join(cur_line_pos)
        df.at[index, 'nlb'] = 'N'

        line_no = df.at[index, 'line_no']
        if index + 1 < len(df):
            next_line_no = float(df.at[index + 1, 'line_no'])
        else:
            # Last row: there is no following line to average with.
            next_line_no = float(line_no) + 1
        new_line_no = (float(line_no) + next_line_no) / 2
        if audit_df is not None:
            if new_line_no in audit_df.index:
                new_line_no = (new_line_no + next_line_no) / 2
            audit_df.loc[new_line_no] = np.nan
            audit_df.loc[new_line_no, 'line_removed'] = 'No'
            audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = \
                'Separated Speaker Mixed with Parenthetical'

        _print_safely(after_par)
        print("identifying parenthetical")
        new_label = index + 0.25   # sorts between row index and row index + 1
        df.loc[new_label] = np.nan
        df.loc[new_label, 'data'] = after_par
        df.loc[new_label, 'parenthetical'] = 'Complete'
        df.loc[new_label, 'When_Identified'] = 'ExaminingSpeakerMix'
        df.loc[new_label, 'Identification_Status'] = 'ps10'
        df.loc[new_label, 'case'] = ''
        df.loc[new_label, 'plb'] = 'N'
        df.loc[new_label, 'nlb'] = nlb
        df.loc[new_label, 'line_no'] = new_line_no
        df = df.sort_index().reset_index(drop=True)
        index += 1                 # skip the row that was just inserted
    return df
def examine_speaker_mix_part2(df, audit_df):
    """Split ``ps30`` lines that hold a speaker and dialogue on one row.

    A row is examined when its first candidate status contains ``ps30``
    (and its second one too, if present).  Two splits are attempted:

    * colon split - if the text has a ``:`` and no speaker extension, the
      part before the colon becomes the speaker (``ps7``) and the part after
      it a new ``ps15`` row;
    * upper-case split - otherwise, for a top status of exactly ``ps30``,
      the leading run of upper-case words becomes the speaker (``ps7``) and
      the remaining words a new ``ps15;ps13`` row.

    The new row gets a ``line_no`` halfway to the next line and is recorded
    in ``audit_df`` (``line_removed`` / ``line_broken_into_multiple_lines``).

    Args:
        df: line-level frame; rows are addressed with a default 0..n-1 index.
        audit_df: audit frame indexed by line number; mutated in place.

    Returns:
        ``df`` with the inserted rows, re-indexed 0..n-1 after each split.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    # "CONTā€™D" is a mis-decoded "CONT’D"; both spellings are matched.
    extensions = ('O.S.', 'V.O.', "CONT'D", "CONTā€™D", "CONT\u2019D", 'VOICE')
    status_col = 'Identification_Status'

    print("Start speaker mix part2")
    # Walk by position against the current frame: rows are inserted while
    # iterating, and iterating the original index would skip the tail rows.
    row = 0
    while row < len(df):
        index = df.index[row]
        row += 1

        status = df.at[index, status_col]
        data = df.at[index, 'data']
        if not isinstance(status, str) or not isinstance(data, str):
            continue
        cur_line_pos = status.split(";")
        if 'ps30' not in cur_line_pos[0]:
            continue
        if len(cur_line_pos) > 1 and 'ps30' not in cur_line_pos[1]:
            continue

        extn_found = any(extn in data for extn in extensions)
        colon_at = data.find(':')
        if colon_at != -1 and not extn_found:
            colon_split = True
            speaker, dialogue = data[:colon_at], data[colon_at + 1:]
            if not speaker.strip():
                print("nothing before colon")
                continue
            print("Seperating speaker dialogue separated by colon")
            dialogue_status = 'ps15'
            audit_note = 'Separated Speaker and Dialogue mixed with colon:'
        elif cur_line_pos[0] == 'ps30':
            colon_split = False
            words = data.lstrip().split(" ")
            upper_count = 0
            for word in words:
                if not word.isupper():
                    break
                upper_count += 1
            speaker = " ".join(words[:upper_count]).strip()
            # Words are re-joined with spaces so the dialogue text survives intact.
            dialogue = " ".join(words[upper_count:]).strip()
            if not speaker or not dialogue:
                print("unable to separate speaker from line, speaker possibly blank or line is not speaker dialogue mix", index)
                continue
            print("Seperating speaker dialogue for ps30")
            dialogue_status = 'ps15;ps13'
            audit_note = 'Separated Speaker and Dialogue '
        else:
            continue
        _say(index, speaker, dialogue)

        nlb = df.at[index, 'nlb']
        line_no = df.at[index, 'line_no']
        df.loc[index, 'data'] = speaker
        df.loc[index, 'parenthetical'] = 'Absent'
        df.loc[index, 'When_Identified'] = 'ExaminingSpeakerMixDialogue'
        if not colon_split:
            df.loc[index, 'case'] = 'AllUpper'
        df.loc[index, status_col] = 'ps7'
        df.loc[index, 'nlb'] = 'N'

        if row < len(df):
            next_line_no = float(df.at[df.index[row], 'line_no'])
            new_line_no = (float(line_no) + next_line_no) / 2
            if new_line_no in audit_df.index:
                new_line_no = (new_line_no + next_line_no) / 2
        else:
            # Last line of the script: there is no following line to average with.
            new_line_no = float(line_no) + 0.5

        # ``.loc[row, col] = value`` creates the row when it is missing.
        if colon_split:
            audit_df.loc[new_line_no] = ''
        audit_df.loc[new_line_no, 'line_removed'] = 'No'
        audit_df.loc[line_no, 'line_broken_into_multiple_lines'] = audit_note

        # A fractional label sorts the new row directly after the speaker row.
        new_label = index + 0.25
        df.loc[new_label] = ''
        for column, value in (
            ('data', dialogue),
            ('parenthetical', 'Absent'),
            ('When_Identified', 'ExaminingSpeakerMixDialogue'),
            (status_col, dialogue_status),
            ('case', ''),
            ('plb', 'N'),
            ('nlb', nlb),
            ('line_no', new_line_no),
        ):
            df.loc[new_label, column] = value
        df = df.sort_index().reset_index(drop=True)

    return df
def start_top_identifications_part2(df):
    """Narrow the candidate statuses of unidentified lines from their context.

    For each row that is not identified and has a status, the pass looks at
    the previous / next non-blank line (``plb`` / ``nlb`` say whether the
    adjacent line is blank) and at the nearest lines with a different indent
    (``ssc``), then applies these rules in order:

    1. Row lists both ``ps6`` and ``ps15``: drop ``ps15`` when the
       different-indent neighbours are (ps15|ps16 before, ps7|ps10 after);
       drop ``ps6`` when they are (ps7|ps10 before, ps1|ps4|ps6|ps16 after).
    2. Next line is directly adjacent and is exactly ``ps6``: drop ``ps6``.
    3. Next non-blank starts with ps4 (and ps1 is not in this row's top two),
       ps1 or ps7, previous non-blank has ps5 in its top two and the same
       indent: the row becomes ``ps6``.
    4. Row's top two are ps5/ps6, a blank line follows and the next non-blank
       is exactly ``ps6``: see the review note in the body.

    Args:
        df: line-level frame with a default 0..n-1 index; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if df.empty:
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        # (statuses, label) of the nearest preceding non-blank line.
        if index == 0:
            return ['blank'], 'first'
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1), index - 1
        if index - 1 == 0:
            return ['blank'], 'first'
        return _statuses(index - 2), index - 2

    def _next_non_blank(index):
        # (statuses, label) of the nearest following non-blank line.
        if index == last_index:
            return ['blank'], 'last'
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1), index + 1
        if index + 1 == last_index:
            return ['blank'], 'last'
        return _statuses(index + 2), index + 2

    def _other_indent_statuses(start, stop, step, cur_indent):
        # Statuses of the nearest non-blank line whose indent differs from
        # ``cur_indent``; None when ``stop`` is reached first (the row at
        # ``stop`` itself is not examined).
        label = start
        while label != stop:
            line_status = df.at[label, status_col]
            if line_status != 'blank' and df.at[label, 'ssc'] != cur_indent:
                return line_status.split(";")
            label += step
        return None

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        pnbl_pos, pnbl_index = _previous_non_blank(index)
        nnbl_pos, _ = _next_non_blank(index)
        cur_indent = df.at[index, 'ssc']
        pnbl_indent = -1 if isinstance(pnbl_index, str) else df.at[pnbl_index, 'ssc']
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) > 1 else ''
        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        cur_line_pos = status.split(";")
        # The top two are taken before rule 1 edits the list, as before.
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1

        # Rule 1: both ps6 and ps15 are candidates.
        if 'ps6' in cur_line_pos and 'ps15' in cur_line_pos:
            pdil_pos = _other_indent_statuses(index, 0, -1, cur_indent)
            ndil_pos = _other_indent_statuses(index, last_index, 1, cur_indent)
            if pdil_pos is not None and ndil_pos is not None:
                if pdil_pos[0] in ('ps15', 'ps16'):
                    if ndil_pos[0] in ('ps7', 'ps10'):
                        print("remove ps15")
                        cur_line_pos.remove('ps15')
                        df.loc[index, status_col] = ";".join(cur_line_pos)
                elif pdil_pos[0] in ('ps7', 'ps10'):
                    if ndil_pos[0] in ('ps1', 'ps4', 'ps6', 'ps16'):
                        print("remove ps6")
                        cur_line_pos.remove('ps6')
                        df.loc[index, status_col] = ";".join(cur_line_pos)

        # Rule 2: the adjacent next line is exactly ps6.
        cur_line_pos = df.at[index, status_col].split(";")
        if "".join(nnbl_pos) == 'ps6' and nlb == 'N':
            df.loc[index, status_col] = ";".join(ps for ps in cur_line_pos if ps != 'ps6')

        # Rule 3.
        next_top = nnbl_pos[0]
        if (next_top == 'ps4' and top1 != 'ps1' and top2 != 'ps1') or next_top in ('ps1', 'ps7'):
            if (not isinstance(pnbl_index, str)
                    and (pnbl_pos[0] == 'ps5' or pnbl_top2 == 'ps5')
                    and cur_indent == pnbl_indent):
                _say(line_no, data, "identifying as PS6")
                df.loc[index, status_col] = 'ps6'
                df.loc[index, 'When_Identified'] = 'StartTopIdentificationPart2'
                continue

        # Rule 4.
        cur_line_pos = df.at[index, status_col].split(";")
        if len(cur_line_pos) == 1:
            continue
        if cur_line_pos[0] in ('ps5', 'ps6') and cur_line_pos[1] in ('ps5', 'ps6'):
            if nlb == 'Y' and "".join(nnbl_pos) == 'ps6':
                if plb == 'N':
                    # NOTE(review): the source reached review with its
                    # indentation collapsed, so it cannot be told whether this
                    # ``else`` belongs to the indent test (as written here) or
                    # to ``plb == 'N'``.  Confirm against the original file.
                    if cur_indent == pnbl_indent:
                        _say(line_no, data, "identifying as ps6")
                        df.loc[index, status_col] = 'ps6'
                    else:
                        # remove ps5
                        _say(line_no, data, "removed ps5")
                        df.loc[index, status_col] = ";".join(
                            ps for ps in cur_line_pos if ps != 'ps5')

    return df
def start_slug_identification(df):
    """Resolve ``ps1`` vs ``ps18`` for lines that directly follow a ``ps16`` line.

    Only interior rows (neither first nor last) are examined.  A row is set
    to ``ps1`` when it is unidentified, lists both ``ps1`` and ``ps18`` as
    candidates, its previous non-blank line is exactly ``ps16``, its next
    non-blank line does not list ``ps1``, and its ``ps1`` weight column is
    greater than its ``ps18`` weight column.

    Args:
        df: line-level frame with a default 0..n-1 index and numeric weight
            columns named ``ps1`` and ``ps18``; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if len(df) < 3:
        # No interior rows to examine.
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        if index == 0:
            return ['blank']
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1)
        if index - 1 == 0:
            return ['blank']
        return _statuses(index - 2)

    def _next_non_blank(index):
        if index == last_index:
            return ['blank']
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1)
        if index + 1 == last_index:
            return ['blank']
        return _statuses(index + 2)

    for index in df.index[1:-1]:
        status = df.at[index, status_col]
        # Rows without a status are skipped, as in the other identification passes.
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue
        cur_line_pos = status.split(";")
        if 'ps1' not in cur_line_pos or 'ps18' not in cur_line_pos:
            continue
        if "".join(_previous_non_blank(index)) != 'ps16':
            continue
        if 'ps1' in _next_non_blank(index):
            continue
        if int(df.at[index, 'ps1']) > int(df.at[index, 'ps18']):
            print("identifying current as ps1 ")
            _say(df.at[index, 'line_no'], df.at[index, 'data'])
            df.loc[index, status_col] = 'ps1'
            df.loc[index, 'When_Identified'] = 'StartIdentifyingSlug'

    return df
def start_top_identifications_part3(df):
    """Confirm ``ps1`` for lines sitting between a ``ps16`` and a ``ps4`` line.

    An unidentified row whose top candidate is ``ps1`` is set to ``ps1``
    when the previous non-blank line lists ``ps16`` among its first five
    candidates and the next non-blank line lists ``ps4`` among its first
    three.  (The pass's own message calls these "transition" and "action".)

    Args:
        df: line-level frame with a default 0..n-1 index; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if df.empty:
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        if index == 0:
            return ['blank']
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1)
        if index - 1 == 0:
            return ['blank']
        return _statuses(index - 2)

    def _next_non_blank(index):
        if index == last_index:
            return ['blank']
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1)
        if index + 1 == last_index:
            return ['blank']
        return _statuses(index + 2)

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue
        if status.split(";")[0] != 'ps1':
            continue
        if 'ps16' not in _previous_non_blank(index)[:5]:
            continue
        if 'ps4' not in _next_non_blank(index)[:3]:
            continue
        print("identifying current as ps1 as between top transitiona and action")
        _say(df.at[index, 'line_no'], df.at[index, 'data'])
        df.loc[index, status_col] = 'ps1'
        df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart3'

    return df
def start_top_identifications_part4(df):
    """Identify ``ps4`` / ``ps16`` lines from context, then refresh ``isIdentified``.

    For each unidentified row with a status:

    * top candidate ``ps4``, previous non-blank exactly ``ps15``, next
      non-blank exactly ``ps6``, ``case`` not ``AllUpper``, same indent as
      the next non-blank and no blank line in between -> ``ps4``;
    * ``ps16`` within the top three, previous non-blank exactly ``ps15`` or
      ``ps6``, next non-blank exactly ``ps1``, blank lines on both sides and
      top candidate not ``ps6`` -> ``ps16``.

    Afterwards the last non-blank line is fixed to its top candidate when
    that is ``ps6`` or ``ps15``, and ``isIdentified`` is recomputed for every
    row with a status: 'Yes' for a single candidate, 'No' otherwise.

    Args:
        df: line-level frame with a default 0..n-1 index; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if df.empty:
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        if index == 0:
            return ['blank']
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1)
        if index - 1 == 0:
            return ['blank']
        return _statuses(index - 2)

    def _next_non_blank(index):
        # (statuses, label) of the nearest following non-blank line.
        if index == last_index:
            return ['blank'], 'last'
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1), index + 1
        if index + 1 == last_index:
            return ['blank'], 'last'
        return _statuses(index + 2), index + 2

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        prev_status = ";".join(_previous_non_blank(index))
        nnbl_pos, nnbl_index = _next_non_blank(index)
        next_status = ";".join(nnbl_pos)
        plb = df.at[index, 'plb']
        nlb = df.at[index, 'nlb']

        ## between 15 and 6 , top 4, nlb=N
        if (top1 == 'ps4' and prev_status == 'ps15' and next_status == 'ps6'
                and df.at[index, 'case'] != 'AllUpper'):
            nnbl_indent = -1 if isinstance(nnbl_index, str) else df.at[nnbl_index, 'ssc']
            if df.at[index, 'ssc'] == nnbl_indent and nlb == 'N':
                print("identifying current as ps4 as between dialogue and action end and top action begin")
                _say(df.at[index, 'line_no'], df.at[index, 'data'])
                df.loc[index, status_col] = 'ps4'
                df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart4'
                continue

        ## between 15,6 and 1 , top 3 has 16, nlb=Y , plb =Y
        if ('ps16' in cur_line_pos[:3] and prev_status in ('ps15', 'ps6')
                and next_status == 'ps1'):
            if plb == 'Y' and nlb == 'Y' and top1 != 'ps6':
                print("identifying current as transition ")
                _say(df.at[index, 'line_no'], df.at[index, 'data'])
                df.loc[index, status_col] = 'ps16'
                df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart4'
                continue

    # Fix the last non-blank line to its top candidate.
    last_line_index = last_index
    if df.at[last_line_index, status_col] == 'blank':
        last_line_index -= 1
    if last_line_index in df.index:
        last_status = df.at[last_line_index, status_col]
        if isinstance(last_status, str):
            last_pos = last_status.split(";")
            if len(last_pos) > 1 and last_pos[0] in ('ps6', 'ps15'):
                print("Identifying last line as top", last_pos[0])
                _say(df.at[last_line_index, 'line_no'], df.at[last_line_index, 'data'])
                df.loc[last_line_index, status_col] = last_pos[0]
                df.loc[last_line_index, 'When_Identified'] = 'IdentifyingLastLine'

    # The original compared (``==``) instead of assigning here, so the flag
    # was never refreshed.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.loc[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
def start_top_identifications_part5(df):
    """Identify ``ps5`` / ``ps1`` lines and prune unlikely candidates.

    For each unidentified row with a status, in order:

    1. previous non-blank exactly ``ps5`` or ``ps4``, top candidate ``ps5``,
       same indent as the next non-blank, and the next non-blank either has
       ``ps6`` in its top two (top not ps1/ps2) or has top ``ps5`` -> ``ps5``;
    2. previous non-blank exactly ``ps16``, ``ps1`` in this row's top two,
       next non-blank has ``ps4`` or ``ps6`` in its top two -> ``ps1``;
    3. next non-blank has neither ``ps4`` in its top four nor ``ps6`` in its
       top two: drop ``ps1`` and ``ps3`` from the candidates, unless the next
       line's top is ps7/ps8 and this row's top is ps1/ps3;
    4. ``lcp`` below 68: drop ``ps2`` and ``ps18``.

    Finally ``isIdentified`` is recomputed for every row with a status:
    'Yes' for a single candidate, 'No' otherwise.

    Args:
        df: line-level frame with a default 0..n-1 index; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if df.empty:
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        if index == 0:
            return ['blank']
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1)
        if index - 1 == 0:
            return ['blank']
        return _statuses(index - 2)

    def _next_non_blank(index):
        # (statuses, label) of the nearest following non-blank line.
        if index == last_index:
            return ['blank'], 'last'
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1), index + 1
        if index + 1 == last_index:
            return ['blank'], 'last'
        return _statuses(index + 2), index + 2

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = status.split(";")
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        prev_status = "".join(_previous_non_blank(index))
        nnbl_pos, nnbl_index = _next_non_blank(index)
        nnbl_top1 = nnbl_pos[0]
        cur_indent = df.at[index, 'ssc']
        nnbl_indent = -1 if isinstance(nnbl_index, str) else df.at[nnbl_index, 'ssc']
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']

        ## pnbl is ps5 or 4 , cur top 5 , next top 2 has 6 (or next top is 5), same indent
        if prev_status in ('ps5', 'ps4'):
            if top1 == 'ps5' and cur_indent == nnbl_indent:
                # NOTE(review): the source reached review with its indentation
                # collapsed; the ``nnbl_top1 == 'ps5'`` alternative is read as
                # belonging to this inner test.  Confirm against the original.
                next_has_ps6 = 'ps6' in nnbl_pos[:2] and nnbl_top1 not in ('ps1', 'ps2')
                if next_has_ps6 or nnbl_top1 == 'ps5':
                    print("identifying current as ps5 as between actions")
                    _say(line_no, data)
                    df.loc[index, status_col] = 'ps5'
                    df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart5'
                    continue

        ## pnbl is ps16 , cur top2 has ps1 , next top 2 has 4 or 6 , declare ps1
        if prev_status == 'ps16' and 'ps1' in (top1, top2):
            if 'ps4' in nnbl_pos[:2] or 'ps6' in nnbl_pos[:2]:
                print("identifying current as ps1 as between transition and action")
                _say(line_no, data)
                df.loc[index, status_col] = 'ps1'
                df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart5'
                continue

        ## dont remove if next line is identified as speaker and ps1 is top
        if 'ps4' not in nnbl_pos[:4] and 'ps6' not in nnbl_pos[:2]:
            if nnbl_top1 in ('ps7', 'ps8') and top1 in ('ps1', 'ps3'):
                print("not removing ps1 as next is speaker and current top is slugline")
            else:
                print("Removing ps1 ps3 from current as next does not have ps4 in top4 ps6 in top2")
                _say(line_no, data)
                df.loc[index, status_col] = ";".join(
                    ps for ps in cur_line_pos if ps not in ('ps1', 'ps3'))

        ## remove ps2 and ps18 as a possibility if the last character position is low
        cur_line_pos = df.at[index, status_col].split(";")
        if df.at[index, 'lcp'] < 68 and ('ps2' in cur_line_pos or 'ps18' in cur_line_pos):
            print("Removing ps2 ps18 from current as lcp < 68")
            _say(line_no, data)
            df.loc[index, status_col] = ";".join(
                ps for ps in cur_line_pos if ps not in ('ps2', 'ps18'))

    # The original compared (``==``) instead of assigning here, so the flag
    # was never refreshed.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.loc[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
def start_top_identifications_part6(df):
    """Confirm ``ps1`` for lines between a ``ps17`` line and a ``ps4`` line.

    An unidentified row whose top candidate is ``ps1`` is set to ``ps1``
    when the previous non-blank line is exactly ``ps17`` and the next
    non-blank line has ``ps4`` in its top two.  (The pass's own message calls
    these "special term" and "action".)  Afterwards ``isIdentified`` is
    recomputed for every row with a status: 'Yes' for a single candidate,
    'No' otherwise.

    Args:
        df: line-level frame with a default 0..n-1 index; mutated in place.

    Returns:
        The same ``df``.
    """

    def _say(*parts):
        # Script text can hold characters the console cannot encode.
        try:
            print(*parts)
        except (UnicodeError, OSError):
            pass

    if df.empty:
        return df

    status_col = 'Identification_Status'
    last_index = df.index[-1]

    def _statuses(label):
        return df.at[label, status_col].split(";")

    def _previous_non_blank(index):
        if index == 0:
            return ['blank']
        if df.at[index, 'plb'] == 'N':
            return _statuses(index - 1)
        if index - 1 == 0:
            return ['blank']
        return _statuses(index - 2)

    def _next_non_blank(index):
        if index == last_index:
            return ['blank']
        if df.at[index, 'nlb'] == 'N':
            return _statuses(index + 1)
        if index + 1 == last_index:
            return ['blank']
        return _statuses(index + 2)

    for index in df.index:
        status = df.at[index, status_col]
        if df.at[index, 'isIdentified'] == 'Yes' or pd.isna(status):
            continue
        ## top 1 is ps1 pnbl is 17 nnbl has ps4 in top2
        if status.split(";")[0] != 'ps1':
            continue
        if "".join(_previous_non_blank(index)) != 'ps17':
            continue
        if 'ps4' not in _next_non_blank(index)[:2]:
            continue
        print("identifying current as ps1 as between special term and action")
        _say(df.at[index, 'line_no'], df.at[index, 'data'])
        df.loc[index, status_col] = 'ps1'
        df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart6'

    # The original compared (``==``) instead of assigning here, so the flag
    # was never refreshed; rows without a status are left untouched.
    for index in df.index:
        status = df.at[index, status_col]
        if pd.isna(status):
            continue
        df.loc[index, 'isIdentified'] = 'No' if ';' in status else 'Yes'

    return df
def start_top_identifications_part7(df):
    """Settle lines whose two leading possibilities are ``ps6`` and ``ps15``.

    For every line that is not yet identified, the first two entries of its
    ``;``-separated ``Identification_Status`` are inspected.  When they are
    ``ps6`` and ``ps15`` (in either order), the line is indented less than the
    previous non-blank line, and the non-blank line before that one is itself
    indented less than the previous non-blank line, the status collapses to
    ``ps15`` and ``When_Identified`` becomes ``StartIdentifyingTopsPart7``.

    Afterwards ``isIdentified`` is refreshed for every row with a status:
    ``Yes`` when a single possibility is left, ``No`` otherwise.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``ssc``,
            ``line_no`` and ``data``.  Row labels are assumed to be
            consecutive integers starting at 0, because neighbours are
            addressed as ``index - 1`` / ``index - 2``.

    Returns:
        The same DataFrame, modified in place.
    """

    def _as_number(value):
        """Return *value* as a float, or None when it is not a real number."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        return None if number != number else number  # NaN is not usable

    def _indent_less(left, right):
        """Compare two indents numerically; fall back to text comparison."""
        left_num, right_num = _as_number(left), _as_number(right)
        if left_num is None or right_num is None:
            # Non-numeric cells: keep the historical string comparison.
            return str(left) < str(right)
        return left_num < right_num

    def _previous_non_blank(index):
        """Label of the previous non-blank line, or None at the file start."""
        if index == 0:
            return None
        if df['plb'][index] == 'N':
            return index - 1
        if index - 1 == 0:
            return None
        return index - 2

    for index in df.index:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        candidates = str(status).split(";")
        top1 = candidates[0]
        top2 = candidates[1] if len(candidates) >= 2 else top1
        if {top1, top2} != {'ps6', 'ps15'}:
            continue

        pnbl_index = _previous_non_blank(index)
        if pnbl_index is None or pnbl_index not in df.index:
            continue
        pnbl_indent = df['ssc'][pnbl_index]
        if not df['ssc'][index] < pnbl_indent:
            continue

        # Step back once more from the previous non-blank line.
        step = 1 if df['plb'][pnbl_index] == 'N' else 2
        ppnbl_index = pnbl_index - step
        # When there is no such line, an indent of 0 is assumed.
        ppnbl_indent = df['ssc'][ppnbl_index] if ppnbl_index in df.index else 0

        if _indent_less(ppnbl_indent, pnbl_indent):
            print("identifying current as ps15 as possibly followed by speaker")
            try:
                print(df['line_no'][index], df['data'][index])
            except UnicodeError:
                pass
            # .loc writes reach the frame itself; chained df[col][idx] = v
            # may silently write to a temporary copy.
            df.loc[index, 'Identification_Status'] = 'ps15'
            df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart7'

    for index in df.index:
        status = df['Identification_Status'][index]
        if pd.isna(status):
            continue
        resolved = len(str(status).split(";")) == 1
        df.loc[index, 'isIdentified'] = 'Yes' if resolved else 'No'
    return df
def start_top_identifications_part8(df):
    """Settle lines whose two leading possibilities are ``ps4`` and ``ps6``.

    A line that is not yet identified, whose first two possibilities are
    ``ps4`` and ``ps6`` (in either order) and whose previous non-blank line
    is exactly ``ps1``, is resolved as follows:

    * ``ps4`` when the next non-blank line is exactly ``ps6`` and directly
      follows the current line (``nlb == 'N'``);
    * ``ps6`` when the next non-blank line has no ``ps5`` possibility and
      does not start with ``ps6``.

    ``When_Identified`` is set to ``StartIdentifyingTopsPart8`` for resolved
    lines.  Afterwards ``isIdentified`` is refreshed for every row with a
    status: ``Yes`` when a single possibility is left, ``No`` otherwise.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``line_no`` and ``data``.  Row labels are assumed to be
            consecutive integers starting at 0.

    Returns:
        The same DataFrame, modified in place.
    """
    last_label = df.index[-1] if len(df.index) else None

    def _possibilities(label):
        """Possibilities stored at *label*; ``['blank']`` when unavailable."""
        if label is None or label not in df.index:
            return ['blank']
        value = df['Identification_Status'][label]
        return ['blank'] if pd.isna(value) else str(value).split(";")

    def _previous_non_blank(index):
        if index == 0:
            return None
        if df['plb'][index] == 'N':
            return index - 1
        if index - 1 == 0:
            return None
        return index - 2

    def _next_non_blank(index):
        if index == last_label:
            return None
        if df['nlb'][index] == 'N':
            return index + 1
        if index + 1 == last_label:
            return None
        return index + 2

    for index in df.index:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        candidates = str(status).split(";")
        top1 = candidates[0]
        top2 = candidates[1] if len(candidates) >= 2 else top1
        if {top1, top2} != {'ps4', 'ps6'}:
            continue
        if "".join(_possibilities(_previous_non_blank(index))) != 'ps1':
            continue

        nnbl_pos = _possibilities(_next_non_blank(index))
        if "".join(nnbl_pos) == 'ps6' and df['nlb'][index] == 'N':
            verdict = 'ps4'
            print("identifying current as ps4 ")
        elif 'ps5' not in nnbl_pos and nnbl_pos[0] != 'ps6':
            verdict = 'ps6'
            print("identifying current as ps6 as possibly between slug and speaker")
        else:
            continue

        try:
            print(df['line_no'][index], df['data'][index])
        except UnicodeError:
            pass
        # .loc writes reach the frame itself; chained df[col][idx] = v
        # may silently write to a temporary copy.
        df.loc[index, 'Identification_Status'] = verdict
        df.loc[index, 'When_Identified'] = 'StartIdentifyingTopsPart8'

    for index in df.index:
        status = df['Identification_Status'][index]
        if pd.isna(status):
            continue
        resolved = len(str(status).split(";")) == 1
        df.loc[index, 'isIdentified'] = 'Yes' if resolved else 'No'
    return df
def decrease_wt_dial_between_action(df):
    """Re-weight and re-rank possibilities of lines that follow a ``ps4`` line.

    For every line that is not yet identified and whose previous non-blank
    line starts with ``ps4``:

    * if ``ps14`` is among the line's first two possibilities, the value in
      its ``ps14`` column is lowered by 5;
    * if the next non-blank line starts with ``ps6``, the value in its
      ``ps5`` column is raised by 11.

    When either adjustment happened, the line's possibilities are re-sorted
    by descending weight.  ``Identification_Status_with_weights`` receives
    the ``name-weight`` pairs and ``Identification_Status`` the bare names,
    both ``;``-separated.  A possibility that has no weight column is dropped
    from both strings.

    Afterwards ``isIdentified`` is refreshed for every row with a status:
    ``Yes`` when a single possibility is left, ``No`` otherwise.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status``, ``Identification_Status_with_weights``,
            ``plb``, ``nlb`` and one weight column per possibility name
            (``ps5``, ``ps14`` ...).  Row labels are assumed to be
            consecutive integers starting at 0.

    Returns:
        The same DataFrame, modified in place.
    """
    last_label = df.index[-1] if len(df.index) else None

    def _weight_key(entry):
        """Numeric weight of a ``name-weight`` entry; unparsable sorts last."""
        # Split only once so that a negative weight ("ps14--2") survives.
        text = entry.split("-", 1)[1]
        try:
            value = float(text)
        except ValueError:
            return float("-inf")
        return float("-inf") if value != value else value

    def _weight_text(pos, index):
        """Weight of *pos* at *index* as text, or None without such a column."""
        if pos not in df.columns:
            return None
        raw = df[pos][index]
        try:
            return str(int(raw))
        except (TypeError, ValueError):
            return str(raw)

    def _possibilities(label):
        """Possibilities stored at *label*; ``['blank']`` when unavailable."""
        if label is None or label not in df.index:
            return ['blank']
        value = df['Identification_Status'][label]
        return ['blank'] if pd.isna(value) else str(value).split(";")

    def _previous_non_blank(index):
        if index == 0:
            return None
        if df['plb'][index] == 'N':
            return index - 1
        if index - 1 == 0:
            return None
        return index - 2

    def _next_non_blank(index):
        if index == last_label:
            return None
        if df['nlb'][index] == 'N':
            return index + 1
        if index + 1 == last_label:
            return None
        return index + 2

    for index in df.index:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = str(status).split(";")
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        pnbl_top1 = _possibilities(_previous_non_blank(index))[0]
        nnbl_top1 = _possibilities(_next_non_blank(index))[0]
        print("checking dialogue between action", index, pnbl_top1,
              cur_line_pos, nnbl_top1)

        # Both adjustments require the previous non-blank line to lead with ps4.
        if pnbl_top1 != 'ps4':
            continue
        wt_changed = False
        if 'ps14' in (top1, top2):
            # Weights are written back as text, as before.
            df.loc[index, 'ps14'] = str(int(df['ps14'][index]) - 5)
            wt_changed = True
        if nnbl_top1 == 'ps6':
            df.loc[index, 'ps5'] = str(int(df['ps5'][index]) + 11)
            wt_changed = True
        if not wt_changed:
            continue

        weighted = []
        for pos in cur_line_pos:
            text = _weight_text(pos, index)
            if text is not None:
                weighted.append(str(pos) + '-' + text)
        # Stable sort: equal weights keep their previous relative order.
        weighted.sort(key=_weight_key, reverse=True)

        df.loc[index, 'Identification_Status_with_weights'] = ';'.join(weighted)
        ranked_names = [entry.split("-")[0] for entry in weighted]
        df.loc[index, 'Identification_Status'] = ';'.join(ranked_names)

    for index in df.index:
        status = df['Identification_Status'][index]
        if pd.isna(status):
            continue
        resolved = len(str(status).split(";")) == 1
        df.loc[index, 'isIdentified'] = 'Yes' if resolved else 'No'
    return df
def examine_among_two(df):
    """Resolve lines that are left with exactly two possibilities.

    Each not-yet-identified line with two ``;``-separated possibilities is
    run through these rules; the first one that fires ends the processing of
    that line:

    1. ``ps6`` or ``ps22`` present: if the text contains a full stop, a
       single word follows the last full stop and the next non-blank line is
       exactly ``ps16``, that word is moved to the front of the next line
       (re-indented to the next line's ``ssc``) and the current line becomes
       ``ps6``.
    2. ``ps7``/``ps8``: ``ps7`` when ``parenthetical`` is ``Absent``.
    3. ``ps1``/``ps2`` with ``ps30``: ``ps1`` when ``lcp <= 63``; otherwise,
       when this and the next non-blank line are ``AllUpper`` and the next
       one leads with ``ps1`` or ``ps3``, this line becomes ``ps2`` and the
       next one ``ps3``.
    4. ``ps15``/``ps29``: ``ps15`` when ``lcp <= 51``.
    5. ``ps13``/``ps9``: ``ps13`` when ``parenthetical`` is ``Absent``.
    6. ``ps1;ps17``: ``ps1`` when its weight exceeds that of ``ps17`` by
       more than 20.
    7. Leading ``ps7`` and text starting with a skip word: ``ps7`` and
       ``ps8`` are removed.
    8. Leading ``ps3`` while the previous non-blank line has no ``ps2`` in
       its first two possibilities: ``ps3`` is removed.

    Afterwards ``isIdentified`` is refreshed for every row with a status:
    ``Yes`` when a single possibility is left, ``No`` otherwise.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``ssc``, ``lcp``, ``case``, ``parenthetical``, ``line_no``,
            ``data`` and the weight columns ``ps1`` / ``ps17``.  Row labels
            are assumed to be consecutive integers starting at 0.

    Returns:
        The same DataFrame, modified in place.
    """
    skip_words = ('ON THE SCREEN', 'ON THE TV', 'MORNING', 'AT HOTEL', 'TV',
                  'MONTAGES', 'MUSICAL MONTAGES', 'ESSENTIALS', 'LATER',
                  'ESSENTIAL')
    last_label = df.index[-1] if len(df.index) else None

    def _known(label):
        return label is not None and label in df.index

    def _possibilities(label):
        """Possibilities stored at *label*; ``['blank']`` when unavailable."""
        if not _known(label):
            return ['blank']
        value = df['Identification_Status'][label]
        return ['blank'] if pd.isna(value) else str(value).split(";")

    def _previous_non_blank(index):
        if index == 0:
            return None
        if df['plb'][index] == 'N':
            return index - 1
        if index - 1 == 0:
            return None
        return index - 2

    def _next_non_blank(index):
        if index == last_label:
            return None
        if df['nlb'][index] == 'N':
            return index + 1
        if index + 1 == last_label:
            return None
        return index + 2

    def _identify(label, status, when, message):
        """Log *message* and store the new status through ``.loc``."""
        print(message)
        try:
            print(df['line_no'][label], df['data'][label])
        except UnicodeError:
            pass
        df.loc[label, 'Identification_Status'] = status
        df.loc[label, 'When_Identified'] = when

    for index in df.index:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = str(status).split(";")
        if len(cur_line_pos) != 2:
            # Every rule below applies to two-possibility lines only.
            continue
        top1, top2 = cur_line_pos
        tops = {top1, top2}

        pnbl_pos = _possibilities(_previous_non_blank(index))
        pnbl_top1 = pnbl_pos[0]
        pnbl_top2 = pnbl_pos[1] if len(pnbl_pos) >= 2 else pnbl_top1

        nnbl_index = _next_non_blank(index)
        nnbl_pos = _possibilities(nnbl_index)
        nnbl_top1 = nnbl_pos[0]
        # Defaults mirror the "no neighbour" values used elsewhere in the file.
        nnbl_indent = df['ssc'][nnbl_index] if _known(nnbl_index) else -1
        nnbl_case = df['case'][nnbl_index] if _known(nnbl_index) else ''

        data = df['data'][index]
        cur_line_par = df['parenthetical'][index]
        cur_line_case = df['case'][index]

        # Rule 1.  NOTE(review): the original comment reads "if 6 and 22
        # left" but the test accepts either one -- confirm which is intended.
        if tops & {'ps6', 'ps22'} and "." in data:
            head, _, last = data.rpartition(".")
            if len(last.split()) == 1 and "".join(nnbl_pos) == 'ps16':
                print("next is transition , merging")
                merged = last.strip() + ' ' + df['data'][nnbl_index].strip()
                merged = merged.rjust(len(merged) + int(nnbl_indent))
                df.loc[nnbl_index, 'data'] = merged
                # Keep the sentence punctuation of the part that stays.
                df.loc[index, 'data'] = head + "."
                _identify(index, 'ps6', 'ExamineLastTwo',
                          "Splitting current and Identifying current action end")
                continue

        # Rule 2.
        if tops == {'ps7', 'ps8'} and cur_line_par == 'Absent':
            _identify(index, 'ps7', 'ExamineLastTwo',
                      "Identifying as speaker as no parenthtical")
            continue

        # Rule 3.
        if ((top1 in ('ps1', 'ps2') and top2 == 'ps30')
                or (top1 == 'ps30' and top2 == 'ps1')):
            if df['lcp'][index] <= 63:
                _identify(index, 'ps1', 'ExamineLastTwo',
                          "Identifying as slugline")
            elif (cur_line_case == 'AllUpper' and nnbl_case == 'AllUpper'
                    and nnbl_top1 in ('ps1', 'ps3')):
                _identify(index, 'ps2', 'ExamineLastTwo',
                          "Identifying as slugline beginning")
                _identify(nnbl_index, 'ps3', 'ExamineLastTwo',
                          "Identifying as slugline end")
            continue

        # Rule 4.
        if tops == {'ps15', 'ps29'} and df['lcp'][index] <= 51:
            _identify(index, 'ps15', 'ExamineLastTwo',
                      "Identifying as dialogue ending")
            continue

        # Rule 5.
        if tops == {'ps13', 'ps9'} and cur_line_par == 'Absent':
            _identify(index, 'ps13', 'ExamineLastTwo',
                      "Identifying as dialogue beginning")
            continue

        if top1 == 'ps1' and top2 == 'ps17':
            # Rule 6.
            if int(df['ps1'][index]) - int(df['ps17'][index]) > 20:
                _identify(index, 'ps1', 'ExamineLastTwo',
                          "identifying current as ps1 ")
                continue
        elif top1 == 'ps7':
            # Rule 7.
            print("Checking stop words")
            search_data = data.replace(":", "").strip()
            if search_data.startswith(skip_words):
                if 'ps8' in cur_line_pos:
                    # NOTE(review): dropping both ps7 and ps8 from a
                    # two-item list stores an empty status -- confirm.
                    remaining = [p for p in cur_line_pos
                                 if p not in ('ps7', 'ps8')]
                    _identify(index, ";".join(remaining),
                              'ExamineSpeakerSkipWords', "ps7,ps8 removed")
                    continue
                print("Could not remove speaker pos")
        else:
            # Rule 8.
            print("Checking sluglineend")
            if top1 == 'ps3' and 'ps2' not in (pnbl_top1, pnbl_top2):
                remaining = list(cur_line_pos)
                remaining.remove('ps3')
                _identify(index, ";".join(remaining),
                          'ExamineSluglineEnd', "ps3 removed")
                continue

    for index in df.index:
        status = df['Identification_Status'][index]
        if pd.isna(status):
            continue
        resolved = len(str(status).split(";")) == 1
        df.loc[index, 'isIdentified'] = 'Yes' if resolved else 'No'
    return df
def examine_action_using_top2_wt_diff(df):
    """Confirm a leading ``ps6`` / ``ps5`` using the gap to the runner-up.

    Every row except the first and the last is examined when it is not yet
    identified.  ``diff`` is the weight of the leading possibility minus the
    weight of the second one (0 when there is only one possibility).

    * Leading ``ps6``: confirmed when the line's ``ssc`` is below 25, the
      next non-blank line is exactly ``ps1`` and ``diff > 15``.
    * Leading ``ps5`` on a line whose ``case`` is not ``AllUpper``:
      confirmed when the previous and next non-blank lines share the line's
      ``ssc``, neither of them is ``AllUpper``, ``diff > 10`` and either the
      previous one is exactly ``ps4``/``ps5`` or the next one is exactly
      ``ps6``/``ps5``.

    Confirmed lines get the single status and ``When_Identified`` set to
    ``ExamineActionUsingTop2Wt``.  ``isIdentified`` is not touched here.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status``, ``When_Identified``, ``plb``, ``nlb``,
            ``ssc``, ``case`` and one weight column per possibility name.
            Weights may be numbers or numeric text.  Row labels are assumed
            to be consecutive integers starting at 0.

    Returns:
        The same DataFrame, modified in place.
    """
    last_label = df.index[-1] if len(df.index) else None

    def _known(label):
        return label is not None and label in df.index

    def _possibilities(label):
        """Possibilities stored at *label*; ``['blank']`` when unavailable."""
        if not _known(label):
            return ['blank']
        value = df['Identification_Status'][label]
        return ['blank'] if pd.isna(value) else str(value).split(";")

    def _cell(column, label, default):
        """Value of *column* at *label*, or *default* without such a row."""
        return df[column][label] if _known(label) else default

    def _weight(pos, index):
        """Numeric weight of *pos* at *index*, or None when unavailable."""
        if pos not in df.columns:
            return None
        try:
            value = float(df[pos][index])
        except (TypeError, ValueError):
            return None
        return None if value != value else value

    def _previous_non_blank(index):
        if index == 0:
            return None
        if df['plb'][index] == 'N':
            return index - 1
        if index - 1 == 0:
            return None
        return index - 2

    def _next_non_blank(index):
        if index == last_label:
            return None
        if df['nlb'][index] == 'N':
            return index + 1
        if index + 1 == last_label:
            return None
        return index + 2

    for index in df.index[1:-1]:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        cur_line_pos = str(status).split(";")
        top1 = cur_line_pos[0]
        top2 = cur_line_pos[1] if len(cur_line_pos) >= 2 else top1
        if top1 not in ('ps5', 'ps6'):
            continue

        top1_wt, top2_wt = _weight(top1, index), _weight(top2, index)
        if top1_wt is None or top2_wt is None:
            # Without both weights the gap cannot be judged.
            continue
        top2_wt_diff = top1_wt - top2_wt
        print("top 2 wt diff", top2_wt_diff)

        pnbl_index = _previous_non_blank(index)
        nnbl_index = _next_non_blank(index)
        pnbl_status = "".join(_possibilities(pnbl_index))
        nnbl_status = "".join(_possibilities(nnbl_index))
        cur_indent = df['ssc'][index]

        if top1 == 'ps6':
            confirmed = (cur_indent < 25 and nnbl_status == 'ps1'
                         and top2_wt_diff > 15)
        else:
            same_indent = (_cell('ssc', pnbl_index, -1) == cur_indent
                           and cur_indent == _cell('ssc', nnbl_index, -1))
            action_neighbour = (pnbl_status in ('ps4', 'ps5')
                                or nnbl_status in ('ps6', 'ps5'))
            confirmed = (df['case'][index] != 'AllUpper'
                         and same_indent
                         and action_neighbour
                         and top2_wt_diff > 10
                         and _cell('case', pnbl_index, '') != 'AllUpper'
                         and _cell('case', nnbl_index, '') != 'AllUpper')

        if confirmed:
            print("identifying as " + top1)
            # .loc writes reach the frame itself; chained df[col][idx] = v
            # may silently write to a temporary copy.
            df.loc[index, 'Identification_Status'] = top1
            df.loc[index, 'When_Identified'] = 'ExamineActionUsingTop2Wt'
    return df
def identify_top_as_final(df):
    """Take the leading possibility of every unresolved line as final.

    For each row that is not marked identified, ``Identification_Status`` is
    replaced by its first ``;``-separated possibility.  The exception: when
    that first possibility is ``ps1`` or ``ps2`` but none of the markers
    ``INT.``, ``EXT.``, ``I/E``, ``E/I``, ``EXT-``, ``INT-`` occurs within
    the first eight characters of the stripped text, the second possibility
    is used instead (which equals the first when there is only one).

    Rows with ``isIdentified == 'Yes'`` or without a status are left alone;
    ``isIdentified`` itself is not modified.

    Args:
        df: DataFrame with the columns ``isIdentified``,
            ``Identification_Status`` and ``data``.

    Returns:
        The same DataFrame, modified in place.
    """
    # Matched literally: as a regex, the dot in "INT." would also accept
    # words such as "INTO".
    slug_markers = ('INT.', 'EXT.', 'I/E', 'E/I', 'EXT-', 'INT-')

    for index in df.index:
        status = df['Identification_Status'][index]
        if df['isIdentified'][index] == 'Yes' or pd.isna(status):
            continue

        candidates = str(status).split(";")
        top1 = candidates[0]
        top2 = candidates[1] if len(candidates) >= 2 else top1

        data = df['data'][index]
        prefix = data.strip()[0:8] if isinstance(data, str) else ''
        contains_slug_words = any(marker in prefix for marker in slug_markers)

        final = top2 if top1 in ('ps1', 'ps2') and not contains_slug_words else top1
        # .loc writes reach the frame itself; chained df[col][idx] = v
        # may silently write to a temporary copy.
        df.loc[index, 'Identification_Status'] = final
    return df
df['data'][index] = df['data'][index].upper() elif new_case == 'AllLower': df['data'][index] = df['data'][index].lower() df['case'][index] = new_case #audit_df['case_format'][line_no] = new_case audit_df['case_corrected'][line_no] = 'Corrected to ' + str(new_case) def correct_left_indent(df,audit_df,index,new_indent): ## line_no = df['line_no'][index] data = df['data'][index] data = data.strip() print("Correcting left indent to",new_indent) df['data'][index] = data.rjust(len(data)+new_indent) df['ssc'][index] = new_indent df['lcp'][index] = new_indent + len(data) - 1 audit_df['left_indent_corrected'][line_no] = 'Left indent Corrected to ' + str(new_indent) def correct_right_indent(df,audit_df,index,new_lcp): ## line_no = df['line_no'][index] data = df['data'][index] data = data.strip() new_indent = 0 print("Correcting right indent to",83 - new_lcp -1) new_indent = new_lcp - len(data) + 1 df['data'][index] = data.rjust(len(data) + new_indent) df['ssc'][index] = new_indent df['lcp'][index] = new_lcp audit_df['right_indent_corrected'][line_no] = 'Right indent Corrected to ' + str(83 - new_lcp -1) def delete_line_after(df,audit_df,index): line_no = df['line_no'][index] removed_line_no = df['line_no'][index+1] df.drop(index + 1, inplace= True) print("line deleted after",line_no) print("line no deleted ",removed_line_no) audit_df['blank_deleted_after'][line_no] = 'Yes' audit_df['line_removed'][removed_line_no] = 'Yes' def delete_line_before(df,audit_df,line_no): line_no = df['line_no'][index] removed_line_no = df['line_no'][index-1] df.drop(index - 1, inplace= True) print("line deleted before",line_no) audit_df['blank_deleted_before'][line_no] = 'Yes' audit_df['line_removed'][removed_line_no] = 'Yes' def insert_line_after(df,audit_df,index): line_no = df['line_no'][index] next_line_no = df['line_no'][index+1] new_line_no = (line_no + next_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + next_line_no)/2 print("inserted blank line after ", 
line_no) df.loc[index + 0.25] = np.nan df.loc[index + 0.25,'Identification_Status'] = 'blank' df.loc[index + 0.25,'case'] = '' df.loc[index + 0.25,'plb'] = 'N' df.loc[index + 0.25,'nlb'] = 'N' df.loc[index + 0.25,'line_no'] = new_line_no df['plb'][index + 1] = 'Y' audit_df['blank_inserted_after'][line_no] = 'Yes' audit_df.loc[new_line_no] = 'No' audit_df.loc[new_line_no]['data'] = '' audit_df.loc[new_line_no]['data_corrected'] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' print("line inserted after ",line_no) def insert_line_before(df,audit_df,index): line_no = df['line_no'][index] pvs_line_no = df['line_no'][index-1] new_line_no = (line_no + pvs_line_no) / 2 if new_line_no in audit_df.index: new_line_no = (new_line_no + line_no)/2 print("inserted blank line before",line_no) df.loc[index - 0.25] = np.nan df.loc[index - 0.25,'Identification_Status'] = 'blank' df.loc[index - 0.25,'case'] = 'None' df.loc[index - 0.25,'plb'] = 'N' df.loc[index - 0.25,'nlb'] = 'N' df.loc[index - 0.25,'line_no'] = new_line_no df['nlb'][index - 1] = 'Y' audit_df['blank_inserted_before'][line_no] = 'Yes' audit_df.loc[new_line_no] = '' audit_df.loc[new_line_no]['line_removed'] = 'No' def check_and_remove_numbers(df,audit_df,index): data = df['data'][index] start_is_num = True scene_num = '' ## check if number at start while start_is_num: sub_num = re.search('\d',data.lstrip()) if sub_num: if sub_num.start() == 0: data = data.replace(sub_num.group(0),'') df['data'][index] = data print(data) scene_num += sub_num.group(0) continue start_is_num = False print("scene num",scene_num) def audit_ps1(df,audit_df,index): print("Auditing Slugline") cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] try: print(cur_data) except: pass new_indent = 15 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllUpper' if cur_case != new_case: 
correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'N': if index != 0 : insert_line_before(df,audit_df,index) df['plb'][index] = 'Y' else: print("previous line already blank") if nlb == 'N': insert_line_after(df,audit_df,index) df['nlb'][index] = 'Y' else: print("next line already blank") ## remove numbers if found at start check_and_remove_numbers(df,audit_df,index) def audit_ps4(df,audit_df,index): print("Auditing Action Beginning") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 15 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'N': insert_line_before(df,audit_df,index) df['plb'][index] = 'Y' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps5(df,audit_df,index): print("Auditing Action Middle") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 15 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already non blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' 
else: print("next line not blank") return nl_deleted def audit_ps6(df,audit_df,index): print("Auditing Action Ending") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 15 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if df['Identification_Status'][index - 1] in ('ps4','ps5'): if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already non blank") else: ## later move this to insert line before pnbl_line_no = df['pnbl_line_no'][index] try: pnbl_identified = True if df.loc[df['line_no'] == pnbl_line_no,'isIdentified'] == 'Yes' else False except: pnbl_identified = False if plb == 'N' and pnbl_identified: insert_line_before(df,audit_df,index) df['plb'][index] = 'Y' else: print("previous line already blank") if nlb == 'N': insert_line_after(df,audit_df,index) df['nlb'][index] = 'Y' else: print("next line already blank") def audit_ps7(df,audit_df,index): print("Auditing Speaker") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 35 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllUpper' if cur_case != new_case: correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'N': insert_line_before(df,audit_df,index) df['plb'][index] = 'Y' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") 
return nl_deleted def audit_ps10(df,audit_df,index): print("Auditing Parenthetical complete") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 30 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllLower' if cur_case != new_case: correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps11(df,audit_df,index): print("Auditing Parenthetical beginning") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 30 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllLower' if cur_case != new_case: correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps20(df,audit_df,index): print("Auditing Parenthetical middle") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 30 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllLower' if cur_case != new_case: 
correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps12(df,audit_df,index): print("Auditing Parenthetical end") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 30 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case new_case = 'AllLower' if cur_case != new_case: correct_case(df,audit_df,index,new_case) else: print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps13(df,audit_df,index): print("Auditing Dialogue Beginning") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 25 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def 
audit_ps14(df,audit_df,index): print("Auditing Dialogue Middle") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 25 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already blank") if nlb == 'Y': delete_line_after(df,audit_df,index) nl_deleted = True df['nlb'][index] = 'N' else: print("next line not blank") return nl_deleted def audit_ps15(df,audit_df,index): print("Auditing Dialogue End") nl_deleted = False cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] new_indent = 25 if cur_indent != new_indent: correct_left_indent(df,audit_df,index,new_indent) else: print("indent already",new_indent) # #check and correct case # new_case = 'AllLower' # if cur_case != new_case: # correct_case(df,audit_df,index,new_case) # else: # print("Case already",new_case) ## plb nlb plb = df['plb'][index] nlb = df['nlb'][index] if plb == 'Y': delete_line_before(df,audit_df,index) df['plb'][index] = 'N' else: print("previous line already not blank") if nlb == 'N': nl_pos = df['Identification_Status'][index+1] if nl_pos == 'ps10': print("not inserting blank as next is parenthtical") else: insert_line_after(df,audit_df,index) df['nlb'][index] = 'Y' else: print("next line already blank") def audit_ps16(df,audit_df,index): print("Auditing Transition") cur_indent = df['ssc'][index] cur_data = df['data'][index] cur_case = df['case'][index] cur_lcp = df['lcp'][index] new_lcp = 72 if cur_lcp != new_lcp: correct_right_indent(df,audit_df,index,new_lcp) else: print("indent already",new_lcp) #check and correct 
def run_audit_on_identified(df, audit_df=False):
    """Apply the layout rules to every identified line of *df*.

    Each row is dispatched on its ``Identification_Status`` code.  A rule
    fixes the left indent (``ssc``) or, for transitions, the last character
    position (``lcp``); optionally forces the case of ``data``; and inserts
    or deletes the neighbouring row depending on the ``plb`` / ``nlb``
    flags ('Y'/'N'; presumably "previous/next line blank").  Rows whose
    status is ``'blank'`` or has no rule are left untouched.

    Args:
        df: line-level frame with at least the columns ``line_no``,
            ``data``, ``Identification_Status``, ``case``, ``ssc``, ``lcp``,
            ``plb`` and ``nlb``.  Neighbours are addressed as ``index - 1``
            and ``index + 1``, so a numeric index is required.  The frame
            is modified in place (rows are added and dropped).
        audit_df: optional log frame indexed by ``line_no``.  It is only
            written to when it is a non-empty DataFrame; cells whose row or
            column does not exist in the log are skipped.

    Returns:
        ``(df, audit_df)`` when a non-empty ``audit_df`` was supplied,
        otherwise just ``df``.  The returned frame is sorted by index and
        renumbered from 0.
    """
    # ``audit_df`` defaults to False, which has no ``.empty`` attribute.
    log_enabled = not getattr(audit_df, "empty", True)

    # status -> (label, indent kind, target column, case, before, after)
    # before/after say what to do with the blank row on that side.
    rules = {
        'ps1': ("Slugline", 'left', 15, 'AllUpper', 'insert', 'insert'),
        'ps4': ("Action Beginning", 'left', 15, None, 'insert', 'delete'),
        'ps5': ("Action Middle", 'left', 15, None, 'delete', 'delete'),
        'ps6': ("Action Ending", 'left', 15, None, 'context', 'insert'),
        'ps7': ("Speaker", 'left', 35, 'AllUpper', 'insert', 'delete'),
        'ps8': ("Speaker", 'left', 35, 'AllUpper', 'insert', 'delete'),
        'ps10': ("Parenthetical complete", 'left', 30, 'AllLower', 'delete', 'delete'),
        'ps11': ("Parenthetical beginning", 'left', 30, 'AllLower', 'delete', 'delete'),
        'ps20': ("Parenthetical middle", 'left', 30, 'AllLower', 'delete', 'delete'),
        'ps12': ("Parenthetical end", 'left', 30, 'AllLower', 'delete', 'delete'),
        'ps13': ("Dialogue Beginning", 'left', 25, None, 'delete', 'delete'),
        'ps14': ("Dialogue Middle", 'left', 25, None, 'delete', 'delete'),
        'ps15': ("Dialogue End", 'left', 25, None, 'delete', 'insert'),
        'ps16': ("Transition", 'right', 72, 'AllUpper', 'insert', 'insert'),
        'ps17': ("Special Term", 'left', 15, 'AllUpper', 'insert', 'insert'),
    }

    def note(line_no, column, value):
        """Write one audit cell; unknown rows/columns are silently skipped."""
        if log_enabled and column in audit_df.columns and line_no in audit_df.index:
            audit_df.loc[line_no, column] = value

    def status_at(label):
        """Return the status of row *label*, or None when that row is gone."""
        if label in df.index:
            return df.loc[label, 'Identification_Status']
        return None

    def add_blank_row(label, line_no, case):
        """Append a blank row under a fractional index label."""
        df.loc[label] = np.nan
        df.loc[label, 'data'] = ''
        df.loc[label, 'Identification_Status'] = 'blank'
        df.loc[label, 'case'] = case
        df.loc[label, 'plb'] = 'N'
        df.loc[label, 'nlb'] = 'N'
        df.loc[label, 'line_no'] = line_no

    def add_audit_row(line_no, values):
        """Create the audit row for a newly inserted line and fill it."""
        if not log_enabled:
            return
        audit_df.loc[line_no] = np.nan
        for column, value in values.items():
            note(line_no, column, value)

    def correct_case(index, line_no, new_case):
        """Upper/lower-case the text of a row and record the new case."""
        print("correcting case to", new_case)
        text = df.loc[index, 'data']
        if new_case == 'AllUpper':
            df.loc[index, 'data'] = text.upper()
        elif new_case == 'AllLower':
            df.loc[index, 'data'] = text.lower()
        df.loc[index, 'case'] = new_case
        note(line_no, 'case_corrected', 'Corrected to ' + str(new_case))

    def correct_left_indent(index, line_no, new_indent):
        """Re-pad the stripped text with *new_indent* leading spaces."""
        text = df.loc[index, 'data'].strip()
        print("Correcting left indent to", new_indent)
        df.loc[index, 'data'] = text.rjust(len(text) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_indent + len(text) - 1
        note(line_no, 'left_indent_corrected',
             'Left indent Corrected to ' + str(new_indent))

    def correct_right_indent(index, line_no, new_lcp):
        """Right-align the stripped text so its last character sits at *new_lcp*."""
        text = df.loc[index, 'data'].strip()
        print("Correcting right indent to", 83 - new_lcp - 1)
        # Text longer than the line cannot be indented; record indent 0
        # instead of a negative value.
        new_indent = max(new_lcp - len(text) + 1, 0)
        df.loc[index, 'data'] = text.rjust(len(text) + new_indent)
        df.loc[index, 'ssc'] = new_indent
        df.loc[index, 'lcp'] = new_indent + len(text) - 1
        note(line_no, 'right_indent_corrected',
             'Right indent Corrected to ' + str(83 - new_lcp - 1))

    def delete_line_after(index, line_no):
        """Drop row ``index + 1``; return True when a row was removed."""
        neighbour = index + 1
        if neighbour not in df.index:
            return False
        removed_line_no = df.loc[neighbour, 'line_no']
        df.drop(neighbour, inplace=True)
        print("line deleted after", line_no)
        print("line no deleted ", removed_line_no)
        note(line_no, 'blank_deleted_after', 'Yes')
        note(removed_line_no, 'line_removed', 'Yes')
        return True

    def delete_line_before(index, line_no):
        """Drop row ``index - 1``; return True when a row was removed."""
        neighbour = index - 1
        if neighbour not in df.index:
            return False
        removed_line_no = df.loc[neighbour, 'line_no']
        df.drop(neighbour, inplace=True)
        print("line deleted before", line_no)
        note(line_no, 'blank_deleted_before', 'Yes')
        note(removed_line_no, 'line_removed', 'Yes')
        return True

    def insert_line_after(index, line_no):
        """Add a blank row between *index* and ``index + 1``.

        Returns False (and changes nothing) when there is no following row,
        because the new line number is the midpoint of the two neighbours.
        """
        neighbour = index + 1
        if neighbour not in df.index:
            return False
        next_line_no = df.loc[neighbour, 'line_no']
        new_line_no = (line_no + next_line_no) / 2
        if log_enabled and new_line_no in audit_df.index:
            new_line_no = (new_line_no + next_line_no) / 2
        print("inserted blank line after ", line_no)
        add_blank_row(index + 0.25, new_line_no, '')
        df.loc[neighbour, 'plb'] = 'Y'
        note(line_no, 'blank_inserted_after', 'Yes')
        add_audit_row(new_line_no,
                      {'data': '', 'data_corrected': '', 'line_removed': 'No'})
        print("line inserted after ", line_no)
        return True

    def insert_line_before(index, line_no):
        """Add a blank row between ``index - 1`` and *index*.

        Returns False (and changes nothing) when there is no preceding row.
        """
        neighbour = index - 1
        if neighbour not in df.index:
            return False
        pvs_line_no = df.loc[neighbour, 'line_no']
        new_line_no = (line_no + pvs_line_no) / 2
        if log_enabled and new_line_no in audit_df.index:
            new_line_no = (new_line_no + line_no) / 2
        print("inserted blank line before", line_no)
        add_blank_row(index - 0.25, new_line_no, 'None')
        df.loc[neighbour, 'nlb'] = 'Y'
        note(line_no, 'blank_inserted_before', 'Yes')
        add_audit_row(new_line_no, {'line_removed': 'No'})
        return True

    def remove_leading_numbers(index):
        """Strip digits from the start of the text, keeping its whitespace.

        Only the leading run(s) of digits are removed; the same digits
        appearing later in the line are left alone.
        """
        text = df.loc[index, 'data']
        scene_num = ''
        match = re.match(r'(\s*)(\d+)', text)
        while match:
            scene_num += match.group(2)
            text = match.group(1) + text[match.end():]
            match = re.match(r'(\s*)(\d+)', text)
        if scene_num:
            df.loc[index, 'data'] = text
        print("scene num", scene_num)

    def previous_non_blank_identified(index):
        """True when the row named by ``pnbl_line_no`` has isIdentified == 'Yes'."""
        if 'pnbl_line_no' not in df.columns or 'isIdentified' not in df.columns:
            return False
        pnbl_line_no = df.loc[index, 'pnbl_line_no']
        flags = df.loc[df['line_no'] == pnbl_line_no, 'isIdentified']
        # A Series has no truth value of its own, so reduce it explicitly.
        return bool((flags == 'Yes').any())

    def audit_line(index, line_no, status):
        """Apply the rule for *status* to one row; True if the next row was dropped."""
        label, indent_kind, target, case, before, after = rules[status]
        print("Auditing " + label)

        if indent_kind == 'right':
            if df.loc[index, 'lcp'] != target:
                correct_right_indent(index, line_no, target)
            else:
                print("indent already", target)
        elif df.loc[index, 'ssc'] != target:
            correct_left_indent(index, line_no, target)
        else:
            print("indent already", target)

        if case is not None:
            if df.loc[index, 'case'] != case:
                correct_case(index, line_no, case)
            else:
                print("Case already", case)

        plb = df.loc[index, 'plb']
        nlb = df.loc[index, 'nlb']

        if before == 'context':
            # An action ending joins the preceding action lines; otherwise it
            # is only separated from an identified previous non-blank line.
            if status_at(index - 1) in ('ps4', 'ps5'):
                before = 'delete'
            elif previous_non_blank_identified(index):
                before = 'insert'
            else:
                before = None

        if before == 'insert':
            if plb == 'N':
                if insert_line_before(index, line_no):
                    df.loc[index, 'plb'] = 'Y'
            else:
                print("previous line already blank")
        elif before == 'delete':
            if plb == 'Y':
                delete_line_before(index, line_no)
                df.loc[index, 'plb'] = 'N'
            else:
                print("previous line already non blank")

        nl_deleted = False
        if after == 'insert':
            if nlb != 'N':
                print("next line already blank")
            elif status == 'ps15' and status_at(index + 1) == 'ps10':
                print("not inserting blank as next is parenthtical")
            elif insert_line_after(index, line_no):
                df.loc[index, 'nlb'] = 'Y'
        elif nlb == 'Y':
            nl_deleted = delete_line_after(index, line_no)
            df.loc[index, 'nlb'] = 'N'
        else:
            print("next line not blank")

        if status == 'ps1':
            # remove numbers if found at start
            remove_leading_numbers(index)
        return nl_deleted

    # Iterate over the labels as they were before any row is added/dropped.
    labels = iter(df.index)
    for index in labels:
        if index not in df.index:
            continue
        status = df.loc[index, 'Identification_Status']
        if status == 'blank' or status not in rules:
            continue
        line_no = df.loc[index, 'line_no']
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(status)
        try:
            nl_deleted = audit_line(index, line_no, status)
        except Exception as exc:
            # Best effort: a malformed row must not stop the whole audit.
            print("audit failed for line", line_no, ":", exc)
            nl_deleted = False
        if nl_deleted:
            # The following row was just removed; do not visit it.
            next(labels, None)

    audited = df.sort_index().reset_index(drop=True)
    if log_enabled:
        return audited, audit_df
    return audited
def merge_line_to_para(df):
    """Collapse line-level rows of *df* into paragraph-level rows.

    Rows are read in order from the ``Identification_Status`` and ``data``
    columns.  Lines that continue an element are joined to its first line
    with single spaces:

    * ``ps2`` absorbs one directly following ``ps3`` (slugline);
    * ``ps4`` absorbs following ``ps5``/``ps6`` lines, ``ps5`` absorbs
      ``ps6`` (action);
    * ``ps11`` absorbs ``ps20``/``ps12``, ``ps20`` absorbs ``ps12``
      (parenthetical);
    * ``ps13`` absorbs ``ps14``/``ps15``, ``ps14`` absorbs ``ps15``
      (dialogue).

    Every slugline (``ps1``, ``ps2``, ``ps3``) increments the scene counter
    before it is stored.  A status with no mapping still produces a row, but
    only ``para_no`` is filled; the other cells are empty strings.

    Args:
        df: frame with ``Identification_Status`` and ``data`` columns.  The
            "next line" is the next row by position, so any index works.

    Returns:
        A DataFrame with object columns ``para_no``, ``scene_no``,
        ``content`` and ``script_element``, indexed by paragraph number
        starting at 1.
    """
    columns = ['para_no', 'scene_no', 'content', 'script_element']
    slugline_codes = ('ps1', 'ps2', 'ps3')
    # status -> element name, for elements stored exactly as one line
    single_line = {
        'ps6': 'action',
        'ps7': 'speaker',
        'ps8': 'speaker',
        'ps9': 'parenthetical',
        'ps10': 'parenthetical',
        'ps12': 'parenthetical',
        'ps15': 'dialogue',
        'ps16': 'transition',
        'ps17': 'special_term',
        'ps27': 'dialogue',
    }
    # status -> (element name, statuses of the lines it absorbs)
    multi_line = {
        'ps4': ('action', ('ps5', 'ps6')),
        'ps5': ('action', ('ps6',)),
        'ps11': ('parenthetical', ('ps20', 'ps12')),
        'ps20': ('parenthetical', ('ps12',)),
        'ps13': ('dialogue', ('ps14', 'ps15')),
        'ps14': ('dialogue', ('ps15',)),
    }

    def clean(value):
        """Return the stripped text of a cell; missing values become ''."""
        if isinstance(value, str):
            return value.strip()
        if pd.isna(value):
            return ''
        return str(value).strip()

    statuses = df['Identification_Status'].tolist()
    texts = [clean(value) for value in df['data'].tolist()]
    total = len(statuses)

    records = []
    scene_no = 0
    pos = 0
    while pos < total:
        status = statuses[pos]
        content = texts[pos]
        pos += 1
        para_no = len(records) + 1

        if status == 'blank':
            records.append((para_no, scene_no, '', 'blank'))
        elif status in slugline_codes:
            # A two-line slugline: ps2 followed immediately by ps3.
            if status == 'ps2' and pos < total and statuses[pos] == 'ps3':
                content = content + ' ' + texts[pos]
                pos += 1
            scene_no += 1
            records.append((para_no, scene_no, content, 'slugline'))
        elif status in multi_line:
            element, followers = multi_line[status]
            # The bounds check lets a paragraph end on the last row.
            while pos < total and statuses[pos] in followers:
                content += ' ' + texts[pos]
                pos += 1
            records.append((para_no, scene_no, content, element))
        elif status in single_line:
            records.append((para_no, scene_no, content, single_line[status]))
        else:
            # Unmapped status: keep the paragraph slot, leave it empty.
            records.append((para_no, '', '', ''))

    # Build the frame once instead of enlarging it row by row.
    para_df = pd.DataFrame(records, columns=columns, dtype=object)
    para_df.index = range(1, len(records) + 1)
    return para_df
line_pos = '' if line_pos == '' or df['isIdentified'][action_index] == 'No': break if action_index < 0: continue if line_pos == '' or df['isIdentified'][action_index] == 'No': continue action_start_index = action_index + 1 action_data = action_data.strip() print("Number of action lines",cur_lines_count) if len(action_data) > 58: print("Need to wrap line") try: print("actiob data:\n",action_data) except: pass wrapped_data = textwrap.wrap(action_data, width = 58) print("Wrapped line 9753") wrapped_data_lines_count = len(wrapped_data) if cur_lines_count == wrapped_data_lines_count: #can change the original line(s) data print("cur and wrapped number of lines same") # for i in range(0,cur_lines_count): # print(wrapped_data[i]) elif wrapped_data_lines_count > cur_lines_count: lines_to_add = wrapped_data_lines_count - cur_lines_count #Multiple action lines print("will need to create ",lines_to_add," more lines") pvs_line_no = df['line_no'][index-1] #float cur_line_no = df['line_no'][index] #float cur_line_index = index pvs_line_index = cur_line_index -1 print("in line no 9874") while lines_to_add != 0: new_line_no = 0.0 pvs_line_no = df['line_no'][pvs_line_index] #float new_line_no = (cur_line_no + pvs_line_no ) / 2 while new_line_no in audit_df.index: new_line_no = (cur_line_no + new_line_no)/2 action_list.append(new_line_no) new_line_index = (cur_line_index + pvs_line_index) /2 df.loc[new_line_index] = np.nan df.loc[new_line_index,'line_no'] = new_line_no if df['Identification_Status'][pvs_line_index] == 'blank': df.loc[new_line_index,'Identification_Status'] = 'ps4' else: df.loc[new_line_index,'Identification_Status'] = 'ps5' #cur_line_no = new_line_no pvs_line_index = new_line_index #cur_line_index -= 1 lines_to_add -= 1 elif wrapped_data_lines_count < cur_lines_count: lines_to_remove = cur_lines_count - wrapped_data_lines_count print("Will need to remove ",lines_to_remove, "lines") remove_index = index -1 #pvs_line_no = df['line_no'][remove_index] while 
lines_to_remove != 0: pvs_line_no = df['line_no'][remove_index] ## remove pvs line df.drop(remove_index, inplace= True) audit_df['line_removed'][pvs_line_no] = 'Yes' action_list.remove(pvs_line_no) remove_index -= 1 lines_to_remove -= 1 action_list.sort() print(action_list) ## add these lines in the original df print("in line no 9914") ## now assign the values to these lines wrapped_index = 0 for line_no in action_list: df_index = df.index[df['line_no'] == line_no] df['data'][df_index] = wrapped_data[wrapped_index] if line_no not in audit_df.index: audit_df.loc[line_no] = np.nan audit_df.loc[line_no,'data'] = '' audit_df.loc[line_no,'data_corrected'] = '' audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes' wrapped_index += 1 else: print("No need to wrap line") try: print(action_data) except: pass print(len(action_data)) print("in line no 9936") if line_pos == 'ps15': dialogue_data = '' dialogue_list = [] print("\n checking Dialogue line",index) cur_lines_count = 0 dialogue_index = int(index) while line_pos not in ('ps7','ps8','ps10','ps12','ps5','ps6'): ## added 5 and 6 as wrong identification causes previous line to be ps5 data = df['data'][dialogue_index] line_no = df['line_no'][dialogue_index] try: print(dialogue_index,line_no,line_pos,data) except: pass #dialogue_data = data.strip() + ' ' + dialogue_data try: dialogue_data = data.strip() + ' ' + dialogue_data except: data = str(data) dialogue_data = data.strip() + ' ' + dialogue_data cur_lines_count += 1 if dialogue_index == index: df['Identification_Status'][dialogue_index] = 'ps15' else: df['Identification_Status'][dialogue_index] = 'ps14' dialogue_index -= 1 dialogue_list.append(line_no) print("\nprinting isIdentified: ") try: li = df['isIdentified'][dialogue_index] == 'No' print("dialogue bunch not fully identified") except: li = '' print("dialogue bunch not fully identified") if li == '' or df['isIdentified'][dialogue_index] == 'No': break # if df['isIdentified'][dialogue_index] == 'No' : 
# print("dialogue bunch not fully identified") # break line_pos = df['Identification_Status'][dialogue_index] if li == '' or df['isIdentified'][dialogue_index] == 'No' : #added li == '' print("dialogue bunch not fully identified") continue dialogue_start_index = dialogue_index + 1 if dialogue_start_index != index: df['Identification_Status'][dialogue_start_index] = 'ps13' dialogue_data = dialogue_data.strip() print("Number of dialogue lines 9990",cur_lines_count) if len(dialogue_data) > 35: print("Need to wrap dialogue line 9992") try: print(dialogue_data) except: pass wrapped_data = textwrap.wrap(dialogue_data, width = 35) wrapped_data_lines_count = len(wrapped_data) if cur_lines_count == wrapped_data_lines_count: #can change the original line(s) data print("cur and wrapped number of lines same") # for i in range(0,cur_lines_count): # print(wrapped_data[i]) elif wrapped_data_lines_count > cur_lines_count: lines_to_add = wrapped_data_lines_count - cur_lines_count #Multiple action lines print("will need to create ",lines_to_add," more lines") pvs_line_no = float(df['line_no'][index-1]) cur_line_no = float(df['line_no'][index]) cur_line_index = index pvs_line_index = cur_line_index -1 while lines_to_add != 0: new_line_no = 0.0 pvs_line_no = float(df['line_no'][pvs_line_index]) new_line_no = (cur_line_no + pvs_line_no ) / 2 while (new_line_no in audit_df.index) or (new_line_no in dialogue_list): new_line_no = (cur_line_no + new_line_no)/2 new_line_no = new_line_no print(cur_line_index,cur_line_no,pvs_line_no,new_line_no) dialogue_list.append(new_line_no) new_line_index = (cur_line_index + pvs_line_index) /2 df.loc[new_line_index] = np.nan df.loc[new_line_index,'line_no'] = new_line_no if df['Identification_Status'][pvs_line_index] in ('ps7','ps10','ps12'): df.loc[new_line_index,'Identification_Status'] = 'ps13' else: df.loc[new_line_index,'Identification_Status'] = 'ps14' #cur_line_no = new_line_no #cur_line_index -= 1 pvs_line_index = new_line_index lines_to_add -= 1 
elif wrapped_data_lines_count < cur_lines_count: lines_to_remove = cur_lines_count - wrapped_data_lines_count print("Will need to remove ",lines_to_remove, "lines") remove_index = index -1 #pvs_line_no = df['line_no'][remove_index] while lines_to_remove != 0: pvs_line_no = df['line_no'][remove_index] ## remove pvs line df.drop(remove_index, inplace= True) audit_df['line_removed'][pvs_line_no] = 'Yes' dialogue_list.remove(pvs_line_no) remove_index -= 1 lines_to_remove -= 1 try: dialogue_list.sort() except: print("converting dialogue_list to float") dialogue_list = [float(value) if type(value) != int else value for value in dialogue_list] dialogue_list.sort() print("diaogue_list",dialogue_list) ## add these lines in the original df print("in line no 10060") ## now assign the values to these lines wrapped_index = 0 for line_no in dialogue_list: df_index = df.index[df['line_no'] == line_no] df['data'][df_index] = wrapped_data[wrapped_index] if line_no not in audit_df.index: audit_df.loc[line_no] = 'No' audit_df.loc[line_no,'data'] = '' audit_df.loc[line_no,'data_corrected'] = '' audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes' wrapped_index += 1 else: print("No need to wrap line") try: print(dialogue_data) except: pass print(len(dialogue_data)) # if all(isinstance(val, int) for val in df['line_no']): # print("All values in 'line_no' are integers.") # elif all(isinstance(val, str) for val in df['line_no']): # print("All values in 'line_no' are strings. 
Converting to floats or integers...") #try: # df['line_no'] = df['line_no'].astype(int) # print("Converted 'line_no' column to integers.") #except ValueError: # df['line_no'] = df['line_no'].astype(float) # print("Converted 'line_no' column to floats.") # else: # print("Values in 'line_no' are of mixed types.") df = df.sort_values(by=['line_no']).reset_index(drop =True) index_iter = iter(df.index) df.fillna({'data':''},inplace=True) for index in index_iter: print(index) line_pos = df['Identification_Status'][index] if line_pos == 'blank': continue data = df['data'][index] try: print("data",data) print(type(data)) except: pass data = data.strip() if line_pos == 'ps10' : par_data = '' par_list = [] print("checking Parenthetical line") cur_lines_count = 0 par_index = index data = df['data'][par_index] line_no = df['line_no'][par_index] try: print(line_pos,data) except: pass par_data = data.strip() cur_lines_count += 1 par_list.append(line_no) line_pos = df['Identification_Status'][par_index] print("Number of parenthetical lines",cur_lines_count) print("index",par_index,"line_no",line_no) if len(par_data) > 20: print("Need to wrap parenthetical line 10133") try: print(par_data) except: pass wrapped_data = textwrap.wrap(par_data, width = 20) wrapped_data_lines_count = len(wrapped_data) if wrapped_data_lines_count > cur_lines_count: lines_to_add = wrapped_data_lines_count - cur_lines_count #Multiple par lines print("will need to create ",lines_to_add," more lines") pvs_line_no = df['line_no'][index-1] #float cur_line_no = df['line_no'][index] #float cur_line_index = index pvs_line_index = cur_line_index -1 while lines_to_add != 0: new_line_no = 0.0 pvs_line_no = df['line_no'][pvs_line_index] try: new_line_no = (cur_line_no + pvs_line_no ) / 2 except: new_line_no = (float(cur_line_no) + pvs_line_no ) / 2 while new_line_no in audit_df.index: new_line_no = (cur_line_no + new_line_no)/2 new_line_no = (new_line_no) par_list.append(new_line_no) new_line_index = (cur_line_index 
+ pvs_line_index) /2 df.loc[new_line_index] = np.nan df.loc[new_line_index,'line_no'] = new_line_no if df['Identification_Status'][pvs_line_index] in ('ps7','ps8','ps15'): df.loc[new_line_index,'Identification_Status'] = 'ps11' df.loc[new_line_index,'isIdentified'] = 'Yes' else: df.loc[new_line_index,'Identification_Status'] = 'ps20' df.loc[new_line_index,'isIdentified'] = 'Yes' cur_line_no = new_line_no cur_line_index = new_line_index lines_to_add -= 1 df['Identification_Status'][index] = 'ps12' try: par_list.sort() except : print("exception accepted:") par_list = [np.array([float(x)]) if isinstance(x, str) else x for x in par_list] par_list.sort() print("\n\npar_list:",par_list,"\n\n") ## add these lines in the original df ## now assign the values to these lines wrapped_index = 0 for line_no in par_list: try: df_index = df.index[df['line_no'] == line_no] print("try block executed\n") except: print("Exception:") df_index = df.index[df['line_no'] == line_no[0]] print("except block executed\n") print("printing df_index 10200",df_index,"\n") df['data'][df_index] = wrapped_data[wrapped_index] print("printing audit_df:\n",audit_df.index,"\n") print("checking the audit_fd:",line_no,"\n") try: if line_no not in audit_df.index: audit_df.loc[line_no] = np.nan audit_df.loc[line_no]['data'] = '' audit_df.loc[line_no]['data_corrected'] = '' print("###########try############") except Exception as e: print("Exception accepted:",e) audit_df['line_wrapped_at_prescribed_right_indent'][line_no] = 'Yes' wrapped_index += 1 else: print("No need to wrap line") try: print(par_data) except: pass print(len(par_data)) try: df = df.sort_values(by=['line_no']).reset_index(drop =True) except: print("Exception 10184:") df['line_no'] = [np.float64(val) if isinstance(val, str) else val for val in df['line_no']] df = df.sort_values(by=['line_no']).reset_index(drop =True) print("The df in merge_text123456789") print(df) return df def check_slug_still_unidentified(df): slug_still_unidentified = 
# Position codes that make an unidentified line count as a possible slugline.
_SLUG_CANDIDATE_CODES = ('ps1', 'ps2', 'ps18')

# (left indent, right indent) in inches applied per wrapped script element.
# Elements missing from this table keep the paragraph's default indents.
_WRAPPED_ELEMENT_INDENTS = {
    'slugline': (0, 0),
    'action': (0, 0),
    'dialogue': (1.0, 1.25),
    'parenthetical': (1.5, 2.25),
    'speaker': (2, 1),
    'transition': (2.5, 0),
    'special_term': (0, 0),
}


def _tighten_paragraph(para):
    """Remove spacing around *para* and force an exact 12pt line pitch."""
    paragraph_format = para.paragraph_format
    paragraph_format.space_before = Pt(0)
    paragraph_format.space_after = Pt(0)
    paragraph_format.line_spacing = Pt(12)
    return para


def _add_tight_paragraph(doc):
    """Append a new paragraph to *doc* with the tight script line format."""
    return _tighten_paragraph(doc.add_paragraph())


def check_slug_still_unidentified(df):
    """Report whether any unidentified line may still be a slugline.

    Looks at the rows whose ``isIdentified`` column equals ``'No'``, splits
    their ``Identification_Status`` on ``';'`` and inspects only the first
    two entries.  Returns ``True`` as soon as one of them is ``'ps1'``,
    ``'ps2'`` or ``'ps18'``; otherwise returns ``False``.

    Rows whose status is not a string (for example NaN) are skipped.
    """
    print("checking if all slugs were identified")
    unidentified = df.loc[df['isIdentified'] == 'No', :]
    for index in unidentified.index:
        try:
            candidates = unidentified['Identification_Status'][index].split(';')
        except (AttributeError, KeyError, TypeError):
            # Non-string status (e.g. NaN) or missing value: nothing to check.
            candidates = []
        else:
            print(candidates)
            candidates = candidates[0:2]
            print("top2 line pos", candidates)
        if any(ps in _SLUG_CANDIDATE_CODES for ps in candidates):
            return True
    return False


def sa_wrapped_output_to_docx(para_df, output_docx):
    """Write already-wrapped script paragraphs to a new .docx file.

    ``para_df`` must provide ``script_element`` and ``content`` columns.  One
    paragraph is emitted per row, in Courier New 12pt; rows whose element is
    ``'blank'`` produce an empty paragraph.  Indents come from
    ``_WRAPPED_ELEMENT_INDENTS`` and transitions are right-aligned.  The
    document is saved to ``output_docx``.
    """
    output_doc = Document()
    font = output_doc.styles['Normal'].font
    font.name = 'Courier New'
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Inches(8.57)
    section.left_margin = Inches(1.5)

    for index in para_df.index:
        para = _add_tight_paragraph(output_doc)
        script_element = para_df['script_element'][index]
        content = para_df['content'][index]
        if script_element == 'blank':
            # The empty paragraph added above is the blank line itself.
            continue
        if script_element == 'transition':
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        indents = _WRAPPED_ELEMENT_INDENTS.get(script_element)
        if indents is not None:
            left, right = indents
            para.paragraph_format.left_indent = Inches(left)
            para.paragraph_format.right_indent = Inches(right)
        para.text = content

    output_doc.save(output_docx)


def sa_output_to_docx(df, output_docx, output_template):
    """Render the audited script frame into a .docx built from a template.

    ``df`` must provide ``Identification_Status``, ``data`` and (for
    ``check_slug_still_unidentified``) ``isIdentified`` columns.  The
    template's first paragraph receives "FADE IN:" when the script opens with
    a ``ps1``/``ps2`` line (optionally preceded by a blank line).  Every row
    then becomes one paragraph; ``ps1``/``ps2`` rows are numbered as scenes
    unless some slugline is still unidentified.  A running line counter
    inserts page breaks near the bottom of a page so that a speaker line or a
    ``ps1``/``ps2`` line is not left at the foot of the page.  "FADE OUT:" is
    appended and the document is saved to ``output_docx``.
    """
    lines_added = 0
    output_doc = Document(output_template)
    font = output_doc.styles['Normal'].font
    font.name = 'Courier New'
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Mm(210)
    section.left_margin = Inches(1.5)

    # The template is expected to contain at least one paragraph.
    paragraph = _tighten_paragraph(output_doc.paragraphs[0])

    slug_still_unidentified = check_slug_still_unidentified(df)
    print(slug_still_unidentified)

    statuses = df['Identification_Status']
    # .get() returns None for a missing label, so frames with fewer than two
    # rows no longer raise KeyError here.
    first_status = statuses.get(0)
    second_status = statuses.get(1)
    if first_status == 'blank' and second_status in ('ps1', 'ps2'):
        paragraph.add_run("FADE IN:")
        lines_added = 1
    elif first_status in ('ps1', 'ps2'):
        # FADE IN followed by a blank line.
        paragraph.add_run("FADE IN:")
        paragraph.add_run().add_break()
        lines_added = 2

    scene_no = 1
    for index in df.index:
        pos = statuses[index]

        if lines_added == 56:
            # Break early if a speaker would be separated from its dialogue.
            if pos in ('ps7', 'ps8'):
                # .get() avoids a KeyError when the speaker is the last row.
                if statuses.get(index + 1) in ('ps9', 'ps10'):
                    output_doc.add_page_break()
                    lines_added = 0
            elif pos in ('ps1', 'ps2'):
                output_doc.add_page_break()
                lines_added = 0
        if lines_added == 57:
            if pos in ('ps7', 'ps8') or pos in ('ps1', 'ps2'):
                output_doc.add_page_break()
                lines_added = 0
        if lines_added == 58:
            lines_added = 0

        data = df['data'][index]
        try:
            print(index, data)
        except Exception:
            # Best-effort debug output; the console may not encode the text.
            pass

        if not slug_still_unidentified and pos in ('ps1', 'ps2'):
            scene_data = str(scene_no)
            left_indent = 12
            print("Removing already present scene number")
            print("Adding scene number")
            # NOTE(review): the first two branches are identical in the
            # visible source and `< 9` looks like it was meant to be `< 10`;
            # kept as found -- confirm the intended separators.
            if scene_no < 9:
                data = scene_data + ' ' + data.lstrip()
            elif scene_no < 100:
                data = scene_data + ' ' + data.lstrip()
            else:
                data = scene_data + data.lstrip()
            data = data.rjust(len(data) + left_indent)
            data = data.rstrip()
            # Repeat the scene number, right-justified after the heading.
            scene_indent = 63 - len(data.strip())
            data = data + scene_data.rjust(scene_indent)
            scene_no += 1

            para = _add_tight_paragraph(output_doc)
            para.paragraph_format.left_indent = -Inches(0.3)
            para.text = data[12:]
        else:
            para = _add_tight_paragraph(output_doc)
            # Non-blank lines drop their first 15 characters.
            para.text = data[15:] if data.strip() else ''

        lines_added += 1

    fade_out = "FADE OUT:"
    fade_out = fade_out.rjust(58 - len(fade_out))
    para = _add_tight_paragraph(output_doc)
    para.text = fade_out

    output_doc.save(output_docx)


# def sa_txt_to_docx(script_txt,output_script_docx):
#     output_template_name = 'ScriptTemplate5.docx'
#     output_template = os.path.join(mypath,output_template_name)
#     new_doc = Document(output_template)
#     style =
#     new_doc.styles['Normal']
#     font = style.font
#     font.name = 'Courier New'
#     font.size = Pt(12)
#     section = new_doc.sections[0]
#     section.page_height = Mm(297)
#     section.page_width = Mm(210)
#     #section.page_width = Inches(11)
#     section.left_margin = Inches(1.5)
#     header = section.header
#     with open(script_txt,'r',encoding='utf-8') as txt_in:
#         lines = txt_in.readlines()
#     for line in lines:
#         para = new_doc.add_paragraph()
#         paragraph_format = para.paragraph_format
#         paragraph_format.space_before = Pt(0)
#         paragraph_format.space_after = Pt(0)
#         paragraph_format.line_spacing = Pt(12)
#         if line.strip():
#             para.text = line[15:]
#         else:
#             para.text = ''
#     new_doc.save(output_script_docx)


# Position code -> human-readable script element name.
_PS_SCRIPT_ELEMENTS = {
    'ps1': 'Slugline',
    'ps2': 'Slugline',
    'ps3': 'Slugline',
    'ps4': 'Action',
    'ps5': 'Action',
    'ps6': 'Action',
    'ps7': 'Speaker',
    'ps8': 'Speaker with Extension',
    'ps9': 'Speaker Extension',
    'ps10': 'Parenthetical',
    'ps11': 'Parenthetical',
    'ps12': 'Parenthetical',
    'ps20': 'Parenthetical',
    'ps13': 'Dialogue',
    'ps14': 'Dialogue',
    'ps15': 'Dialogue',
    'ps16': 'Transition',
    'ps17': 'Special Term',
}


def _indent_inches(length):
    """Return *length* in inches, or 0 when the indent is unset (``None``)."""
    return 0 if length is None else length.inches


def _try_print(text):
    """Print *text* for debugging, ignoring console failures."""
    try:
        print(text)
    except Exception:
        # Best-effort only: the console may be unable to encode the text.
        pass


def sa_output_to_txt(output_script_docx, output_script_txt):
    """Dump a script .docx to a UTF-8 text file, emulating indents with spaces.

    Each paragraph's first-line and left indents (inches) are summed,
    multiplied by 10 and truncated to a whole number of characters; a fixed
    15-character left margin is added on top.  Every line of the paragraph
    text is left-padded by that amount and written followed by a newline.
    A document with no paragraphs yields an empty file.
    """
    from docx import Document

    read_doc = Document(output_script_docx)
    all_paras = read_doc.paragraphs
    print(len(all_paras))
    left_margin = 15

    with open(output_script_txt, 'w', encoding='utf-8') as f:
        for para in all_paras:
            paragraph_format = para.paragraph_format
            fli = _indent_inches(paragraph_format.first_line_indent)
            li = _indent_inches(paragraph_format.left_indent)
            ri = _indent_inches(paragraph_format.right_indent)
            indent = int((fli + li) * 10)
            print(fli, li, indent, ri)

            lines = para.text.split('\n')
            print(len(lines))
            for line in lines:
                _try_print(line)
                line = line.rjust(len(line) + indent + left_margin)
                _try_print(line)
                f.write(line)
                f.write('\n')


def _audit_cell(audit_df, column, index, default):
    """Return the audit cell as text, or *default* when the cell is missing.

    Rows may have been created filled with NaN, so a NaN cell is not treated
    as a recorded value.
    """
    value = audit_df[column][index]
    if isinstance(value, str):
        return value
    try:
        missing = bool(pd.isna(value))
    except (TypeError, ValueError):
        missing = False
    return default if missing else str(value)


def _audit_change_comments(audit_df, index):
    """Collect the change descriptions recorded for one audit row, in order."""
    comments = []
    for column in ('left_indent_corrected', 'right_indent_corrected'):
        # These columns carry the comment text itself.
        value = _audit_cell(audit_df, column, index, 'No')
        if value != 'No':
            comments.append(value)

    case_value = _audit_cell(audit_df, 'case_corrected', index, 'No')
    if case_value != 'No':
        comments.append('Case ' + case_value)

    fixed_labels = (
        ('line_wrapped_at_prescribed_right_indent',
         'Line Wrapped at Prescribed Right Indent'),
        ('line_broken_into_multiple_lines', 'Line Broken into Multiple Lines'),
        ('line_merged_with_next_line', 'Line Merged with Next Line'),
    )
    for column, label in fixed_labels:
        if _audit_cell(audit_df, column, index, 'No') != 'No':
            comments.append(label)
    return comments


def _add_report_line(para, text, breaks=1):
    """Append *text* to *para* as its own run followed by *breaks* line breaks."""
    run = para.add_run()
    run.text = text
    for _ in range(breaks):
        run.add_break()
    return run


def print_audit_report_docx(audit_df, audit_report_docx):
    """Write a per-line audit report to a new .docx file.

    ``audit_df`` is indexed by line number.  For every row the report lists
    the line number, the ``data`` column, and then either "Line was removed"
    (when ``line_removed`` is ``'Yes'``) or the ``data_corrected`` column
    followed by a numbered list of changes.  The changes are taken from
    ``left_indent_corrected``, ``right_indent_corrected``,
    ``case_corrected``, ``line_wrapped_at_prescribed_right_indent``,
    ``line_broken_into_multiple_lines`` and ``line_merged_with_next_line``;
    a value other than ``'No'`` counts as a change.  Missing (NaN) cells are
    rendered as empty text / treated as ``'No'`` instead of raising.
    """
    output_doc = Document()
    para = output_doc.add_paragraph()
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    _add_report_line(para, ' Audit Report', breaks=2)

    for index in audit_df.index:
        para = output_doc.add_paragraph()
        # A break added before assigning run.text would be replaced by the
        # text, so only the trailing breaks are emitted.
        _add_report_line(para, "Line No: " + str(index), breaks=2)
        _add_report_line(
            para, "Current Data: " + _audit_cell(audit_df, 'data', index, ''))

        if audit_df['line_removed'][index] == 'Yes':
            _add_report_line(para, "Line was removed")
            continue

        _add_report_line(
            para,
            "Corrected Data: " + _audit_cell(audit_df, 'data_corrected', index, ''))
        _add_report_line(para, "Changes Done:- ")

        comments = _audit_change_comments(audit_df, index)
        for sno, change_comment in enumerate(comments, start=1):
            _add_report_line(para, str(sno) + '. ' + change_comment)
        if not comments:
            _add_report_line(para, 'No Changes Done')

    output_doc.save(audit_report_docx)


def ps_to_script_element(ps):
    """Map a position code such as ``'ps7'`` to its script element name.

    Returns ``''`` for any value that is not a known code.
    """
    if not isinstance(ps, str):
        return ''
    return _PS_SCRIPT_ELEMENTS.get(ps, '')


# def print_audit_report_tabular_docx(audit_df):
#     print("inside audit report")
#     #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed
#     output_doc = Document()
#     style =
output_doc.styles['Normal'] # font = style.font # #font.name = 'Courier New' # font.size = Pt(8) # section = output_doc.sections[-1] # section.orientation = WD_ORIENT.LANDSCAPE # section.page_width = Inches(11) # section.left_margin = Inches(0.25) # section.right_margin = Inches(0.25) # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.CENTER # run = para.add_run() # font = run.font # font.size = Pt(12) # run.text = ' Audit Report' # run.add_break() # run.add_break() # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.LEFT # run = para.add_run() # font = run.font # font.size = Pt(10) # run.text = ' Audit Summary' # para = output_doc.add_paragraph() # run = para.add_run() # font = run.font # font.size = Pt(9) # print("audit summary column is created") # case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:]) # left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:]) # right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:]) # wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:]) # table =output_doc.add_table(1, cols =2) # table.style = 'Table Grid' # font.size = Pt(9) # heading_cells = table.rows[0].cells # heading_cells[0].width = Inches(2) # heading_cells[1].width = Inches(1) # heading_cells[0].text = 'Type of Change Done' # heading_cells[1].text = 'Count of Lines' # for i in range(0,2): # heading_cells[i].paragraphs[0].runs[0].font.bold = True # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Case Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(case_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # 
cells[0].text = 'Left Indent Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(left_indent_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Case Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(right_indent_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Lines Wrapped at prescribed indents' # cells[1].width = Inches(0.5) # cells[1].text = str(wrapped_lines_count) # # run.add_break() # para = output_doc.add_paragraph() # run = para.add_run() # run.add_break() # run.add_break() # font.size = Pt(8) # no_rows = len(audit_df.index) # table =output_doc.add_table(1, cols =6) # table.style = 'Table Grid' # table.autofit = False # # table.columns[0].width = Inches(0.5) # # table.columns[1].width = Inches(4) # # table.columns[2].width = Inches(4) # # table.columns[3].width = Inches(0.5) # heading_cells = table.rows[0].cells # heading_cells[0].width = Inches(0.5) # heading_cells[1].width = Inches(0.5) # heading_cells[2].width = Inches(3.5) # heading_cells[3].width = Inches(0.8) # heading_cells[4].width = Inches(3.5) # heading_cells[5].width = Inches(2) # heading_cells[0].text = 'Line No' # heading_cells[1].text = 'Audited Line No' # heading_cells[2].text = 'Current Content' # heading_cells[3].text = 'Script Element' # heading_cells[4].text = 'New Content' # heading_cells[5].text = 'Changes Done' # print("assigned heading") # for i in range(0,6): # heading_cells[i].paragraphs[0].runs[0].font.bold = True # heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9) # print("assigned Index") # for index in audit_df.index: # row_index = 1 # #line_no = audit_df['line_no'][index] # cells = table.add_row().cells # cells[0].width = Inches(0.5) # cells[0].text = str(index) # audited_line_no = audit_df['audited_line_no'][index] # data = str(audited_line_no) # cells[1].width = Inches(0.5) # cells[1].text = data # cur_data = 
audit_df['data'][index] # data = cur_data # cells[2].width = Inches(3.5) # data = str(data) # cells[2].text = data # if audit_df['Identification_Status'][index] == 'blank': # script_element = 'Blank Line' # elif audit_df['Identification_Status'][index] == '': # if audit_df['introduction'][index] == 'Yes': # script_element = 'Title/Introduction' # elif audit_df['appendix'][index] == 'Yes': # script_element = 'Appendix' # # -----------------------------changed with mohit sir # else: # continue # # -----------------------------changed with mohit sir # else: # script_element = ps_to_script_element(audit_df['Identification_Status'][index]) # data = script_element # cells[3].width = Inches(0.8) # cells[3].text = data # new_data = audit_df['data_corrected'][index] # data = new_data # cells[4].width = Inches(3.5) # data = str(data) # cells[4].text = data # # if audit_df['line_removed'][index] == 'Yes': # # data = "Line was removed" # # run = para.add_run() # # run.text = data # # run.add_break() # # continue # sno = 1 # changes_done = False # if audit_df['left_indent_corrected'][index] != 'No': # change_comment = audit_df['left_indent_corrected'][index] # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['right_indent_corrected'][index] != 'No': # change_comment = audit_df['right_indent_corrected'][index] # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['case_corrected'][index] != 'No': # change_comment = 'Case ' + str(audit_df['case_corrected'][index]) # data = str(sno) + '. 
' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No': # change_comment = 'Line Wrapped at Prescribed Right Indent' # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_broken_into_multiple_lines'][index] != 'No': # change_comment = 'Line Broken into Multiple Lines' # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_merged_with_next_line'][index] != 'No': # change_comment = 'Line Merged with Next Line' # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['language_specific_audit_comments'][index] != 'No': # change_comment = str(audit_df['language_specific_audit_comments'][index]) # data = str(sno) + '. 
' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if not changes_done: # data = 'No Changes Done' # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # row_index += 1 # buffer = io.BytesIO() # output_doc.save(buffer) # buffer.seek(0) # print("complete") # #output_doc.save(audit_report_tabular_docx) # return buffer # def print_audit_report_tabular_docx(audit_df): # #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed # output_doc = Document() # style = output_doc.styles['Normal'] # font = style.font # #font.name = 'Courier New' # font.size = Pt(8) # section = output_doc.sections[-1] # section.orientation = WD_ORIENT.LANDSCAPE # section.page_width = Inches(11) # section.left_margin = Inches(0.25) # section.right_margin = Inches(0.25) # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.CENTER # run = para.add_run() # font = run.font # font.size = Pt(12) # run.text = ' Audit Report' # run.add_break() # run.add_break() # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.LEFT # run = para.add_run() # font = run.font # font.size = Pt(10) # run.text = ' Audit Summary' # para = output_doc.add_paragraph() # run = para.add_run() # font = run.font # font.size = Pt(9) # case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No'),:]) # left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No'),:]) 
# right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No'),:]) # wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No'),:]) # table =output_doc.add_table(1, cols =2) # table.style = 'Table Grid' # font.size = Pt(9) # heading_cells = table.rows[0].cells # heading_cells[0].width = Inches(2) # heading_cells[1].width = Inches(1) # heading_cells[0].text = 'Type of Change Done' # heading_cells[1].text = 'Count of Lines' # for i in range(0,2): # heading_cells[i].paragraphs[0].runs[0].font.bold = True # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Case Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(case_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Left Indent Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(left_indent_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Case Corrected' # cells[1].width = Inches(0.5) # cells[1].text = str(right_indent_corrected_count) # cells = table.add_row().cells # font.size = Pt(8) # cells[0].width = Inches(2) # cells[0].text = 'Lines Wrapped at prescribed indents' # cells[1].width = Inches(0.5) # cells[1].text = str(wrapped_lines_count) # run.add_break() # para = output_doc.add_paragraph() # run = para.add_run() # run.add_break() # run.add_break() # font.size = Pt(8) # no_rows = len(audit_df.index) # table =output_doc.add_table(1, cols =6) # table.alignment = WD_TABLE_ALIGNMENT.CENTER # table.style = 'Table Grid' # table.autofit = False # table.columns[0].width = Inches(0.5) # table.columns[1].width = Inches(1.2) # table.columns[2].width = Inches(2) # table.columns[3].width = Inches(1.5) # table.columns[4].width = Inches(2) # table.columns[5].width = Inches(2.5) # 
heading_cells = table.rows[0].cells # heading_cells[0].width = Inches(0.5) # heading_cells[1].width = Inches(0.5) # heading_cells[2].width = Inches(3.5) # heading_cells[3].width = Inches(0.8) # heading_cells[4].width = Inches(3.5) # heading_cells[5].width = Inches(2) # heading_cells[0].text = 'Line No' # heading_cells[1].text = 'Audited Line No' # heading_cells[2].text = 'Current Content' # heading_cells[3].text = 'Script Element' # heading_cells[4].text = 'New Content' # heading_cells[5].text = 'Changes Done' # for i in range(0,6): # heading_cells[i].paragraphs[0].runs[0].font.bold = True # heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9) # for index in audit_df.index: # columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"] # audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No') # if audit_df.loc[index, columns_to_check].eq('No').all().all(): # continue # elif audit_df['introduction'][index] == 'Yes': # continue # elif audit_df['appendix'][index] == 'Yes': # continue # elif audit_df['Identification_Status'][index] == 'blank': # continue # elif pd.isna(audit_df.loc[index, "Identification_Status"]): # continue # row_index = 1 # #line_no = audit_df['line_no'][index] # cells = table.add_row().cells # cells[0].width = Inches(0.5) # cells[0].text = str(index) # audited_line_no = audit_df['audited_line_no'][index] # data = str(audited_line_no) # cells[1].width = Inches(0.5) # cells[1].text = data # cur_data = audit_df['data'][index] # data = str(cur_data).strip() # cells[2].width = Inches(3.5) # data = 
str(data) # cells[2].text = data # if audit_df['Identification_Status'][index] == 'blank': # script_element = 'Blank Line' # elif audit_df['Identification_Status'][index] == '': # if audit_df['introduction'][index] == 'Yes': # script_element = 'Title/Introduction' # elif audit_df['appendix'][index] == 'Yes': # script_element = 'Appendix' # # -----------------------------changed with mohit sir # else: # continue # # -----------------------------changed with mohit sir # else: # script_element = ps_to_script_element(audit_df['Identification_Status'][index]) # data = script_element # cells[3].width = Inches(0.8) # cells[3].text = data # new_data = audit_df['data_corrected'][index] # data = str(new_data).strip() # cells[4].width = Inches(3.5) # data = str(data) # cells[4].text = data # sno = 1 # changes_done = False # # identification_status = audit_df['Identification_Status'][index] # if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "": # continue # if audit_df['left_indent_corrected'][index] != 'No': # change_comment = audit_df['left_indent_corrected'][index] # try: # str_int = change_comment[-2]+change_comment[-1] # except Exception as e: # pass # if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue": # if str_int == "15": # change_comment = "Dialogue line left index corrected to 1.5 Inch" # elif str_int == "25": # change_comment = "Dialogue line left index corrected to 2.5 Inch" # if str_int == "15": # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f"{name} line left indent corrected to 1.5 Inch" # print(change_comment) # elif str_int == "25": # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_commen = f"{name} left indent corrected to 2.5 Inch" # elif str_int == "30": # change_comment = "Parenthetical left indent corrected to 3 Inch" # elif str_int == "35": # change_comment = "Speaker left indent corrected to 3.5 
Inch" # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['right_indent_corrected'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = audit_df['right_indent_corrected'][index] # try: # str_int = change_comment[-2]+change_comment[-1] # except Exception as e: # pass # if str_int == "10": # change_comment = f"{name} right indent corrected to 1 Inch" # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['case_corrected'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # string = str(audit_df['case_corrected'][index]) # string = string.split() # content = string[-1] # if content == "AllUpper": # change_comment = f'{name} Case ' + "Corrected to All Upper" # elif content == "AllLower": # change_comment = f'{name} Case ' + "Corrected to All Lowerr" # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. 
' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No': # change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch' # name = ps_to_script_element(audit_df['Identification_Status'][index]) # if name == "Action": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch' # elif name == "Dialogue": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch' # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_broken_into_multiple_lines'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f'{name} line Broken into Multiple Lines' # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_merged_with_next_line'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f'{name} line Merged with Next Line' # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. 
' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['language_specific_audit_comments'][index] != 'No': # pass # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index]) # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['blank_inserted_after'][index] != 'No': # change_comment = 'A blank line is added below' # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if not changes_done: # continue # # data = 'No Changes Done' # # cells[5].width = Inches(2) # # para = cells[5].add_paragraph() # # run = para.add_run() # # run.text = data # # run.add_break() # row_index += 1 # buffer = io.BytesIO() # output_doc.save(buffer) # buffer.seek(0) # # output_doc.save(audit_report_tabular_docx) # return buffer # def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language): # #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added #punctuation_mark_removed # total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])]) # # <---------------------BLANK LINE ADD AND 
remove LOGIC IS HERE-----------------> # blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] ) # blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] ) # blankline_inserted = blankline_added + blank_add_after # blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] ) # blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] ) # blankline_removed_total = blankline_rem_before + blank_rem_after # ### <<----------------- logic for case ---------------------------------> # # for slugline # # case corrected # sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :]) # print(sluglinecase_corrected_count) # # indentatioin corrected # sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count # print("sluglin_indentation:",slugline_indentation) # # formate corrected # slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& 
(audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:]) # slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6 # print("slugline_formated",slugline_formated) # #total sluglines # total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :]) # print(total_no_sluglines) # # for actioon -----line # # case corrected # actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :]) # print(actionlinecase_corrected_count) # # indentatioin corrected # actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionright_indent_corrected_count = 
len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count # print("actionliine_indentation:",actionline_indentation) # # formate corrected # actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) # actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6 # print("actionline_formated",actionline_formated) # #total no of 
actionline # total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :]) # print(total_actionlines) # # for Speaker # # case corrected # speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :]) # print("speakercase_corrected_count", speakercase_corrected_count) # # indentatioin corrected # speakerleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count # print("speaker_indentation:",speaker_indentation) # # formate corrected # speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & 
(audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + speaker_formate4 + speaker_formate5 + speaker_formate6 # print("speaker_formated",speaker_formated) # #total no of speaker -speaker # total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) # print(total_no_speaker) # # for Parenthetical -----line # # case corrected # parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :]) # print(parentheticalcase_corrected_count) # # indentatioin corrected # parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_line_indentation = parenthetical_left_indent_corrected_count + 
parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count # print("parenthetical_line_indentation:",parenthetical_line_indentation) # # formate corrected # parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6 # print("parenthetical_formated",parenthetical_formated) # #total number of parenthetical # total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) # print(total_no_parenthetical) # # for Dialogue -----line # # case corrected # Dialogue_case_corrected_count = 
len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :]) # print(Dialogue_case_corrected_count) # # indentatioin corrected # dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count # print("dialogue_line_indentation:",dialogue_line_indentation) # # formate corrected # dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & 
(audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6 # print("dialogue_formated",dialogue_formated) # # total number of dialogue # total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) # print(total_no_dialogue) # # for Transistion -----line # # case corrected # transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :]) # print(transitions_case_corrected_count) # # indentatioin corrected # transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count # print("transitions_line_indentation:",transitions_line_indentation) # # formate corrected # transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& 
(audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) # transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6 # print("transitions_formated",transitions_formated) # #total transition # total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:]) # print(total_no_transition) # # for Spectial Terms -----line # # case corrected # st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :]) # print("st_case_corrected_count",st_case_corrected_count) # # indentatioin corrected # st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& 
(audit_df['Identification_Status'].isin(['ps17'])),:]) # st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count # print("st_line_indentation:",st_line_indentation) # # formate corrected # st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) # st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6 # print("st_formated",st_formated) # #total numner of special terms # total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:]) # if total_special_terms < 1 : # total_special_terms = 1 # print(total_special_terms) # # write logic for the percentage # #a # difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber) # average_of_page_no = 
(int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2 # final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100 # #b # difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no) # average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2 # final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100 # #c # try: # ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100 # except: # ratio_for_blanklines = 0 # #j # try: # ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100 # except: # ratio_for_sluglines = 0 # #d # try: # ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100 # except: # ratio_for_actionlines = 0 # #e # try: # ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100 # except: # ratio_for_Speaker = 0 # #f # try: # ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100 # except: # ratio_for_parenthetical = 0 # #g # try: # ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100 # except: # ratio_for_dialogues = 0 # #h # try: # ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100 # except: # ratio_for_transitions = 0 # #i # try: # ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100 # except: # ratio_for_special_terms = 0 # average_of_c_j = 
(ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7 # audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j) # audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%" # print("audit_configuration_percentage",audit_configuration_percentage_str) # total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms) # print("total_script_element_correct",total_script_element_correct) # audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no # print("audit_script_accuracy",audit_script_accuracy) # # audit_script_accuracy_str = min(audit_script_accuracy*100 , 100) # audit_script_accuracy_str = min(audit_script_accuracy*100,100) # audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%" # print("audit_script_accuracy_str",audit_script_accuracy_str) # # the table logics ends here # # percenteage table from here # output_doc = Document() # style = output_doc.styles['Normal'] # font = style.font # #font.name = 'Courier New' # font.size = Pt(10) # section = output_doc.sections[-1] # section.orientation = WD_ORIENT.LANDSCAPE # section.page_width = Inches(11) # section.left_margin = Inches(0.25) # section.right_margin = Inches(0.25) # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.CENTER # # Audit Summary at center of the page with bold # run = para.add_run() # font = run.font # font.bold = True # font.size = Pt(14) # run.text = ' Audit Summary' # run.add_break() # # Add a paragraph for the left-aligned "Audit Date" # current_date = date.today() # # Convert to the "day month year" format # formatted_date = 
current_date.strftime("%d %B %Y") # left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAudit Date: " + str(formatted_date)) # left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT # font_audit_date = left_aligned_text.runs[0].font # font_audit_date.size = Pt(12) # para = output_doc.add_paragraph() # right_aligned_text = para.add_run('\t\tScriptname: ' + str(scriptname)) # right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT # font_right = right_aligned_text.font # font_right.size = Pt(12) # author_para = output_doc.add_paragraph() # run_author = author_para.add_run("\t\tAuthor: " + str(author)) # font_author = run_author.font # font_author.size = Pt(12) # language_script_para = output_doc.add_paragraph() # run_language_script = language_script_para.add_run("\t\tLanguage of Script: " + str(script_language)) # font_language_script = run_language_script.font # font_language_script.size = Pt(12) # language_dialogue_para = output_doc.add_paragraph() # run_language_dialogue = language_dialogue_para.add_run("\t\tLanguage of Dialogue: " + str(dialogue_language)) # font_language_dialogue = run_language_dialogue.font # font_language_dialogue.size = Pt(12) # # for pre audit and post Audit # para = output_doc.add_paragraph() # run = para.add_run() # font = run.font # font.size = Pt(11) # percent_table = output_doc.add_table(1, cols =2) # percent_table.alignment = WD_TABLE_ALIGNMENT.CENTER # percent_table.style = 'Table Grid' # percent_heading_cells = percent_table.rows[0].cells # percent_heading_cells[0].width = Inches(1.5) # percent_heading_cells[1].width = Inches(1) # percent_heading_cells[0].text = 'Audit Contribution' # percent_heading_cells[1].text = str(audit_configuration_percentage_str) # percent_heading_cells = percent_table.add_row().cells # font.size = Pt(12) # percent_heading_cells[0].width = Inches(1.5) # percent_heading_cells[0].text = 'Audit Script Accuracy' # percent_heading_cells[1].width = Inches(1) # 
percent_heading_cells[1].text = str(audit_script_accuracy_str) # para = output_doc.add_paragraph() # pre_post_table = output_doc.add_table(1, cols =3) # pre_post_table.alignment = WD_TABLE_ALIGNMENT.CENTER # pre_post_table.style = 'Table Grid' # preheading_cells = pre_post_table.rows[0].cells # preheading_cells[0].width = Inches(1.5) # preheading_cells[1].width = Inches(1) # preheading_cells[2].width = Inches(1) # preheading_cells[1].text = 'Pre Audit' # preheading_cells[2].text = 'Post Audit' # # row No of pages # pcells = pre_post_table.add_row().cells # font.size = Pt(12) # pcells[0].width = Inches(1.5) # pcells[0].text = 'No of Pages' # pcells[1].width = Inches(1) # pcells[1].text = str(pre_audit_pagenumber) # pcells[2].width = Inches(1) # pcells[2].text = str(postauditpagenumber) # # row no of lines # pcells = pre_post_table.add_row().cells # font.size = Pt(12) # pcells[0].width = Inches(1.5) # pcells[0].text = 'No of lines' # pcells[1].width = Inches(1) # pcells[1].text = str(preaudit_line_no) # pcells[2].width = Inches(1) # pcells[2].text = str(postaudit_line_no) # # adding extra line after the table above # para = output_doc.add_paragraph() # run = para.add_run() # font = run.font # font.size = Pt(12) # #-------------------------------------- # bl_table = output_doc.add_table(1, cols =2) # bl_table.alignment = WD_TABLE_ALIGNMENT.CENTER # bl_table.style = 'Table Grid' # bl_heading_cells = bl_table.rows[0].cells # bl_heading_cells[0].width = Inches(1.5) # bl_heading_cells[0].text = 'Blank Lines Added' # bl_heading_cells[1].width = Inches(1.5) # bl_heading_cells[1].text = str(blankline_inserted) # add the number here # blcells = bl_table.add_row().cells # font.size = Pt(12) # blcells[0].width = Inches(1.5) # blcells[0].text = 'Blank Lines Removed' # blcells[1].width = Inches(1.5) # blcells[1].text = str(blankline_removed_total) # add the number here # # adding extra line after the table above # para = output_doc.add_paragraph() # run = para.add_run() # font = 
run.font # font.size = Pt(12) # sum_table = output_doc.add_table(1, cols =4) # sum_table.alignment = WD_TABLE_ALIGNMENT.CENTER # sum_table.style = 'Table Grid' # sum_heading_cells = sum_table.rows[0].cells # sum_heading_cells[0].width = Inches(1.5) # sum_heading_cells[0].text = '' # sum_heading_cells[1].width = Inches(1.5) # sum_heading_cells[1].height = Inches(0.5) # sum_heading_cells[1].text = 'Case Correction' # sum_heading_cells[2].width = Inches(1.5) # sum_heading_cells[2].text = 'Indent Correction' # sum_heading_cells[3].width = Inches(1.5) # sum_heading_cells[3].text = 'Format Correction' # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].height = Inches(0.3) # sum_cells[0].text = 'Sluglines' # sum_cells[1].width = Inches(1.5) # sum_cells[1].height = Inches(0.3) # sum_cells[1].text = str(sluglinecase_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].height = Inches(0.3) # sum_cells[2].text = str(slugline_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].height = Inches(0.3) # sum_cells[3].text = str(slugline_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].text = 'Actioin Lines' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(actionlinecase_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(actionline_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(actionline_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].text = 'Speakers' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(speakercase_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(speaker_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(speaker_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = 
Inches(1.5) # sum_cells[0].text = 'Parentheticals' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(parentheticalcase_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(parenthetical_line_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(parenthetical_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].text = 'Dialogues' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(Dialogue_case_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(dialogue_line_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(dialogue_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].text = 'Transitions' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(transitions_case_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(transitions_line_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(transitions_formated) # sum_cells = sum_table.add_row().cells # font.size = Pt(12) # sum_cells[0].width = Inches(1.5) # sum_cells[0].text = 'Special Terms' # sum_cells[1].width = Inches(1.5) # sum_cells[1].text = str(st_case_corrected_count) # sum_cells[2].width = Inches(1.5) # sum_cells[2].text = str(st_line_indentation) # sum_cells[3].width = Inches(1.5) # sum_cells[3].text = str(st_line_indentation) # para = output_doc.add_paragraph() # run = para.add_run() # font = run.font # font.size = Pt(12) # para = output_doc.add_paragraph() # run = para.add_run() # run.add_break() # run.add_break() # #--------------------------- 14-09-2023 # for _ in range(5): # output_doc.add_paragraph() # #----------------------- 14-09-23 # para = output_doc.add_paragraph() # para.alignment = WD_ALIGN_PARAGRAPH.CENTER # # Audit detail at center of the page with bold # run = para.add_run() # font = run.font # 
font.bold = True # font.size = Pt(14) # run.text = ' Audit Details' # run.add_break() # # -------------------------- 14-09-23 # no_rows = len(audit_df.index) # table =output_doc.add_table(1, cols =6) # table.alignment = WD_TABLE_ALIGNMENT.CENTER # table.style = 'Table Grid' # table.autofit = False # table.columns[0].width = Inches(0.5) # table.columns[1].width = Inches(1.2) # table.columns[2].width = Inches(2) # table.columns[3].width = Inches(1.5) # table.columns[4].width = Inches(2) # table.columns[5].width = Inches(2.5) # # table.columns[3].width = Inches(0.5) # heading_cells = table.rows[0].cells # heading_cells[0].width = Inches(0.1) # heading_cells[1].width = Inches(0.1) # heading_cells[2].width = Inches(3.5) # heading_cells[3].width = Inches(0.8) # heading_cells[4].width = Inches(3.5) # heading_cells[5].width = Inches(2) # heading_cells[0].text = 'Line No' # heading_cells[1].text = 'Audited Line No' # heading_cells[2].text = 'Current Content' # heading_cells[3].text = 'Script Element' # heading_cells[4].text = 'New Content' # heading_cells[5].text = 'Changes Done' # for i in range(0,6): # heading_cells[i].paragraphs[0].runs[0].font.bold = True # heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9) # #------------------------------->LOGIC HERE<--------------------------------------------- # report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no']) # for index in audit_df.index: # columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", "line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" 
,"language_specific_audit_comments"] # audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No') # if audit_df.loc[index, columns_to_check].eq('No').all().all(): # # All columns contain 'No', skip this row # continue # elif audit_df['introduction'][index] == 'Yes': # continue # elif audit_df['appendix'][index] == 'Yes': # continue # elif audit_df['Identification_Status'][index] == 'blank': # continue # elif pd.isna(audit_df.loc[index, "Identification_Status"]): # continue # para_value = audit_df["para_no"][index] # ---------------------------------------------><------------------------- # current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None # if para_value == current_para_value: # continue # else: # # report_df = report_df.append(audit_df.loc[index], ignore_index=True) # new_row = audit_df.loc[index].to_frame().T # report_df = pd.concat([report_df, new_row], ignore_index=True) # print("current_para_value",current_para_value) # row_index = 1 # old_line_no_index = index # collection_old_line_no = [] # while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value): # if audit_df['Identification_Status'][old_line_no_index] != "blank": # try: # data = int(old_line_no_index) # collection_old_line_no.append(str(data)) # except ValueError: # pass # old_line_no_index += 1 # cells = table.add_row().cells # cells[0].width = Inches(0.1) # cells[0].text = ', '.join(collection_old_line_no) # audited_line_index = index # #--------------------------------------audited_lino_no------------------ # collection_audited_line_no = [] # while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value): # if audit_df['Identification_Status'][audited_line_index] != "blank": # audited_line_no = audit_df['audited_line_no'][audited_line_index] # try: # data = int(audited_line_no) # collection_audited_line_no.append(str(data)) # except ValueError: # pass # 
audited_line_index += 1 # print("collection_audited_line_no", collection_audited_line_no) # data_string = ', '.join(collection_audited_line_no) # print("data_string:", data_string) # cells[1].width = Inches(0.1) # cells[1].text = data_string # #------------------------------>OLD DATA<--------------------------------- # data_index = index # collection_data = [] # while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value): # cur_data = audit_df['data'][data_index] # if not pd.isna(cur_data): # Check if the value is not NaN # data = str(cur_data).strip() # collection_data.append(data) # data_index += 1 # cells[2].width = Inches(3.5) # data = str(data) # cells[2].text = '\n '.join(collection_data) # if audit_df['Identification_Status'][index] == 'blank': # script_element = 'Blank Line' # elif audit_df['Identification_Status'][index] == '': # if audit_df['introduction'][index] == 'Yes': # script_element = 'Title/Introduction' # elif audit_df['appendix'][index] == 'Yes': # script_element = 'Appendix' # else: # continue # else: # script_element = ps_to_script_element(audit_df['Identification_Status'][index]) # data = script_element # cells[3].width = Inches(0.8) # cells[3].text = data # collection_new_data = [] # new_data_index = index # while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value): # if audit_df["line_removed"][new_data_index] == "No": # new_data = audit_df['data_corrected'][new_data_index] # if not pd.isna(new_data): # Check if the value is not NaN # data = str(new_data).strip() # collection_new_data.append(data) # new_data_index += 1 # data = str(new_data).strip() # cells[4].width = Inches(3.5) # data = str(data) # cells[4].text = '\n '.join(collection_new_data) # sno = 1 # changes_done = False # # identification_status = audit_df['Identification_Status'][index] # if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "": # continue # 
if audit_df['left_indent_corrected'][index] != 'No': # change_comment = audit_df['left_indent_corrected'][index] # try: # str_int = change_comment[-2]+change_comment[-1] # except Exception as e: # pass # if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue": # if str_int == "15": # change_comment = "Dialogue line left index corrected to 1.5 Inch" # elif str_int == "25": # change_comment = "Dialogue line left index corrected to 2.5 Inch" # if str_int == "15": # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f"{name} line left indent corrected to 1.5 Inch" # print(change_comment) # elif str_int == "25": # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_commen = f"{name} left indent corrected to 2.5 Inch" # elif str_int == "30": # change_comment = "Parenthetical left indent corrected to 3 Inch" # elif str_int == "35": # change_comment = "Speaker left indent corrected to 3.5 Inch" # data = str(sno) + '. ' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['right_indent_corrected'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = audit_df['right_indent_corrected'][index] # try: # str_int = change_comment[-2]+change_comment[-1] # except Exception as e: # pass # if str_int == "10": # change_comment = f"{name} right indent corrected to 1 Inch" # data = str(sno) + '. 
' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['case_corrected'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # string = str(audit_df['case_corrected'][index]) # string = string.split() # content = string[-1] # if content == "AllUpper": # change_comment = f'{name} Case ' + "Corrected to All Upper" # elif content == "AllLower": # change_comment = f'{name} Case ' + "Corrected to All Lowerr" # if len(str(change_comment)) <= 2 : # continue # data = str(sno) + '. ' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No': # change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch' # name = ps_to_script_element(audit_df['Identification_Status'][index]) # if name == "Action": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch' # elif name == "Dialogue": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch' # data = str(sno) + '. ' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_broken_into_multiple_lines'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f'{name} line Broken into Multiple Lines' # data = str(sno) + '. 
' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['line_merged_with_next_line'][index] != 'No': # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f'{name} line Merged with Next Line' # data = str(sno) + '. ' + str(change_comment) # # dataa = data.split() # # if dataa[-1] == "nan": # # continue # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['language_specific_audit_comments'][index] != 'No': # pass # name = ps_to_script_element(audit_df['Identification_Status'][index]) # change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index]) # data = str(sno) + '. ' + str(change_comment) # cells[5].width = Inches(2) # para = cells[5].add_paragraph() # run = para.add_run() # run.text = data # run.add_break() # sno += 1 # changes_done = True # if audit_df['blank_inserted_after'][index] != 'No': # change_comment = 'A blank line is added below' # # name = ps_to_script_element(audit_df['Identification_Status'][index]) # # if name == "Action": # # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch' # # elif name == "Dialogue": # # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch' # data = str(sno) + '. 
# (tail of the commented-out report-table code that precedes these helpers)
# ' + str(change_comment)
# # dataa = data.split()
# # if dataa[-1] == "nan":
# # continue
# cells[5].width = Inches(2)
# para = cells[5].add_paragraph()
# run = para.add_run()
# run.text = data
# run.add_break()
# sno += 1
# changes_done = True
# if not changes_done:
# continue
# # data = 'No Changes Done'
# # cells[5].width = Inches(2)
# # para = cells[5].add_paragraph()
# # run = para.add_run()
# # run.text = data
# # run.add_break()
# row_index += 1
# buffer = io.BytesIO()
# output_doc.save(buffer)
# buffer.seek(0)
# #output_doc.save(audit_report_tabular_docx)
# return buffer


def count_the_line(text_file_path):
    """Return the number of lines in the text file at *text_file_path*."""
    # Iterate the file object so the whole file is never held in memory.
    with open(text_file_path, 'r') as fp:
        return sum(1 for _ in fp)


def convert_to_pdf(input_docx, out_folder):
    """Run headless LibreOffice to convert *input_docx* to PDF.

    ``out_folder`` is passed as ``--outdir``. The call blocks until the
    ``libreoffice`` process exits; its exit status is not inspected.
    """
    command = [
        "libreoffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        out_folder,
        input_docx,
    ]
    p = subprocess.Popen(command)
    print(["--convert-to", "pdf", input_docx])
    p.communicate()


def countPages(docfile, pdf_file_path, base_path_directory):
    """Convert *docfile* to PDF and return the page count of *pdf_file_path*.

    Args:
        docfile: document handed to ``convert_to_pdf``.
        pdf_file_path: path of the PDF that is opened and counted.
        base_path_directory: output directory handed to ``convert_to_pdf``.

    Returns:
        The value of ``PdfFileReader.getNumPages()`` for ``pdf_file_path``.
    """
    convert_to_pdf(docfile, base_path_directory)
    print("converted to pdf")
    print("pdf_file_path",pdf_file_path)
    # Close the PDF handle deterministically instead of leaking it.
    with open(pdf_file_path, "rb") as pdf_stream:
        number_of_pages = PdfFileReader(pdf_stream).getNumPages()
    return number_of_pages


def convert_txt_to_docx(txt_file_path, docx_file_path):
    """Write the UTF-8 text of *txt_file_path* into a new .docx file.

    The whole text becomes a single paragraph of a fresh document that is
    saved at *docx_file_path*.
    """
    with open(txt_file_path, 'r', encoding='utf-8') as txt:
        text = txt.read()
    doc = docx.Document()
    doc.add_paragraph(text)
    doc.save(docx_file_path)


# script_element -> (left indent in inches, right indent in inches,
#                    upper-case the content, right-align the paragraph)
_ELEMENT_LAYOUT = {
    "slugline": (0, 0, True, False),
    "action": (0, 0, False, False),
    "dialogue": (1.0, 1.25, False, False),
    "parenthetical": (1.5, 2.25, False, False),
    "speaker": (2, 1, True, False),
    "transition": (2.5, 0, False, True),
    "special_term": (0, 0, False, False),
}


# The return annotation is quoted so it is not evaluated at definition time.
def csv_to_docx(csv: pd.DataFrame) -> "Document":
    """Build a Courier New 12pt document from a script table.

    Args:
        csv: frame with a ``script_element`` and a ``content`` column; one
            paragraph is emitted per row, in row order.

    Returns:
        The populated ``Document``. Rows whose element is ``"blank"`` stay
        as empty paragraphs; elements missing from ``_ELEMENT_LAYOUT`` are
        written without any indent settings.
    """
    output_doc = Document()
    font = output_doc.styles["Normal"].font
    font.name = "Courier New"
    font.size = Pt(12)

    section = output_doc.sections[0]
    section.page_height = Mm(297)
    section.page_width = Inches(8.57)
    section.left_margin = Inches(1.5)

    for script_element, content in zip(csv["script_element"], csv["content"]):
        para = output_doc.add_paragraph()
        paragraph_format = para.paragraph_format
        paragraph_format.space_before = Pt(0)
        paragraph_format.space_after = Pt(0)
        paragraph_format.line_spacing = Pt(12)

        if script_element == "blank":
            continue

        # Normalise before any string method is used: float content
        # (e.g. NaN from an empty cell) previously reached ``.upper()``
        # for sluglines/speakers and raised AttributeError.
        if isinstance(content, float):
            content = ""
        elif not isinstance(content, str):
            content = str(content)

        layout = _ELEMENT_LAYOUT.get(script_element)
        if layout is not None:
            left, right, force_upper, align_right = layout
            if align_right:
                para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            paragraph_format.left_indent = Inches(left)
            paragraph_format.right_indent = Inches(right)
            if force_upper:
                content = content.upper()

        para.text = content
    return output_doc


def language_detector_for_csv(orginal_csv_path):
    """Detect the language of every action and dialogue row.

    Args:
        orginal_csv_path: a CSV path (read with ``pd.read_csv``) or an
            already loaded DataFrame with ``script_element`` and
            ``content`` columns.

    Returns:
        ``(actionline_lang, dialogue_lang)``. Each list holds one
        single-element list ``[language_code[detected]]`` per matching row.
    """
    # Accept a ready DataFrame explicitly rather than relying on a bare
    # ``except`` around read_csv, which also hid genuine read failures.
    if isinstance(orginal_csv_path, pd.DataFrame):
        audit_df = orginal_csv_path
    else:
        audit_df = pd.read_csv(orginal_csv_path)

    actionline_lang = []
    dialogue_lang = []
    for element, content in zip(audit_df["script_element"], audit_df["content"]):
        if element == "action":
            bucket = actionline_lang
        elif element == "dialogue":
            bucket = dialogue_lang
        else:
            continue
        src_lang = language_detector(content)
        bucket.append([language_code[src_lang]])
    return actionline_lang, dialogue_lang


def _tag_following(df, status, start, followers, para_no):
    """Tag consecutive rows from *start* whose status is in *followers*."""
    spot_index = start
    # ``Series.get`` yields None past the last row, which ends the run.
    while status.get(spot_index) in followers:
        df.at[spot_index, 'para_no'] = para_no
        spot_index += 1


def assign_para_no(df):
    """Number the paragraphs of an audit frame.

    A ``para_no`` column is added to *df* in place (0 = not numbered) and
    filled from the ``Identification_Status`` codes, so that an opening row
    and the continuation rows that follow it share one number. The row
    arithmetic (``index + 1`` and ``len(df)``) assumes consecutive integer
    labels starting at 0.

    Neighbour lookups that fall outside the frame are treated as
    "no match" instead of raising ``KeyError``.

    Returns:
        A frame with the same columns, ``para_no`` moved to position 3.
    """
    para_no = 1
    df['para_no'] = 0
    status = df['Identification_Status']

    for index in df.index:
        line_pos = status[index]

        if line_pos == 'blank':
            continue

        if line_pos in ('ps1', 'ps10', 'ps16', 'ps17'):
            df.at[index, 'para_no'] = para_no
            para_no += 1

        elif line_pos == "ps2":
            df.at[index, 'para_no'] = para_no
            if status.get(index + 1) == "ps3":
                df.at[index + 1, 'para_no'] = para_no
            para_no += 1

        elif line_pos == 'ps4':
            df.at[index, 'para_no'] = para_no
            _tag_following(df, status, index + 1, ("ps5", "ps6", "ps4"), para_no)
            para_no += 1

        elif line_pos == 'ps13':
            df.at[index, 'para_no'] = para_no
            spot_index = index + 1
            while (spot_index < len(df)
                   and status[spot_index] in ("ps14", "ps15", "ps13", "blank")):
                if (status[spot_index] == "blank"
                        and spot_index + 1 < len(df)
                        and status[spot_index + 1] == "ps14"):
                    # Step over the blank (it stays untagged) to the ps14.
                    spot_index += 1
                df.at[spot_index, 'para_no'] = para_no
                spot_index += 1
            para_no += 1

        elif line_pos == 'ps6':
            # Already covered when it continues a ps4/ps5 run.
            if status.get(index - 1) not in ("ps5", "ps4"):
                df.at[index, 'para_no'] = para_no
                para_no += 1

        elif line_pos == "ps7":
            df.at[index, 'para_no'] = para_no
            _tag_following(df, status, index + 1, ("ps8", "ps9"), para_no)
            para_no += 1

        elif line_pos == "ps8":
            # Always opens a new number, whatever the next row is.
            df.at[index, 'para_no'] = para_no
            para_no += 1

        elif line_pos == 'ps15':
            if status.get(index - 1) in ("ps7", "ps12", "ps10", "ps20", "ps8", "blank"):
                df.at[index, 'para_no'] = para_no
                para_no += 1

        elif line_pos == "ps14":
            if status.get(index - 1) in ("ps8", "ps7"):
                df.at[index, 'para_no'] = para_no
                _tag_following(df, status, index + 1, ("ps15",), para_no)
                para_no += 1

        elif line_pos == 'ps11':
            df.at[index, 'para_no'] = para_no
            _tag_following(df, status, index + 1, ("ps12", "ps20"), para_no)
            para_no += 1

        elif line_pos == "ps20":
            if status.get(index - 1) == "ps11":
                continue
            if status.get(index + 1) == "ps12":
                df.at[index, 'para_no'] = para_no
                df.at[index + 1, 'para_no'] = para_no
            # NOTE(review): when neither neighbour matches, the counter is
            # advanced without tagging this row - confirm this is intended.
            para_no += 1

        # "ps12" and any other code: never numbered on their own turn.

    columns = list(df.columns)
    columns.insert(3, columns.pop(columns.index('para_no')))
    df = df[columns]
    return df


def print_audit_report_tabular_docx(audit_df,scriptname,author,pre_audit_pagenumber,postauditpagenumber,preaudit_line_no,postaudit_line_no,script_language,dialogue_language):
    #line_removed header left_indent_corrected right_indent_corrected line_wrapped_at_prescribed_right_indent case_corrected
    #blank_inserted_before blank_inserted_after blank_deleted_before blank_deleted_after space_removed_between_characters
    #space_added_between_characters line_merged_with_next_line line_broken_into_multiple_lines punctuation_mark_added
    #punctuation_mark_removed
    total_no_blanklines = len(audit_df[audit_df['Identification_Status'].isin(['blank'])])
    # <---------------------BLANK LINE ADD AND remove LOGIC IS HERE----------------->
    blankline_added = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_before'] != 'No'),:] )
    blank_add_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_inserted_after'] != 'No'),:] )
    blankline_inserted = blankline_added + blank_add_after
    blankline_rem_before = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_before'] != 'No'),:] )
    blank_rem_after = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['blank_deleted_after'] != 'No'),:] )
    blankline_removed_total = blankline_rem_before + blank_rem_after
    ### <<----------------- logic for case --------------------------------->
    # for slugline
    # case corrected
    sluglinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
    print(sluglinecase_corrected_count)
    # indentatioin corrected
    sleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    sright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    swrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_indentation = sleft_indent_corrected_count + sright_indent_corrected_count + swrapped_lines_count
    print("sluglin_indentation:",slugline_indentation)
    # formate corrected
    slugline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])),:])
    slugline_formated = slugline_formate1 + slugline_formate2 + slugline_formate3 + slugline_formate4 + slugline_formate5 + slugline_formate6
    print("slugline_formated",slugline_formated)
    #total sluglines
    total_no_sluglines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps1', 'ps2', 'ps3'])), :])
    print(total_no_sluglines)
    # for actioon -----line
    # case corrected
    actionlinecase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :])
    print(actionlinecase_corrected_count)
    # indentatioin corrected
    actionleft_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
    actionright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
    actionwrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:])
    actionline_indentation = actionleft_indent_corrected_count + actionright_indent_corrected_count + actionwrapped_lines_count
print("actionliine_indentation:",actionline_indentation) # formate corrected actionline_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])),:]) actionline_formated = actionline_formate1 + actionline_formate2 + actionline_formate3 + actionline_formate4 + actionline_formate5 + actionline_formate6 print("actionline_formated",actionline_formated) #total no of actionline total_actionlines = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps4', 'ps5', 'ps6'])), :]) print(total_actionlines) # for Speaker # case corrected speakercase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])), :]) print("speakercase_corrected_count", speakercase_corrected_count) # indentatioin corrected speakerleft_indent_corrected_count = 
len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speakerright_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_indentation = speakerleft_indent_corrected_count + speakerright_indent_corrected_count + speaker_lines_count print("speaker_indentation:",speaker_indentation) # formate corrected speaker_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) speaker_formated = speaker_formate1 + speaker_formate2 + speaker_formate3 + 
speaker_formate4 + speaker_formate5 + speaker_formate6 print("speaker_formated",speaker_formated) #total no of speaker -speaker total_no_speaker = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps7', 'ps8', 'ps9'])),:]) print(total_no_speaker) # for Parenthetical -----line # case corrected parentheticalcase_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])), :]) print(parentheticalcase_corrected_count) # indentatioin corrected parenthetical_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_line_indentation = parenthetical_left_indent_corrected_count + parenthetical_right_indent_corrected_count + parenthetical_wrapped_lines_count print("parenthetical_line_indentation:",parenthetical_line_indentation) # formate corrected parenthetical_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No') & (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formate3 = 
len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) parenthetical_formated = parenthetical_formate1 + parenthetical_formate2 + parenthetical_formate3 + parenthetical_formate4 + parenthetical_formate5 + parenthetical_formate6 print("parenthetical_formated",parenthetical_formated) #total number of parenthetical total_no_parenthetical = len(audit_df.loc[(audit_df['line_removed'] == 'No')& (audit_df['Identification_Status'].isin(['ps10', 'ps11', 'ps12', 'ps20'])),:]) print(total_no_parenthetical) # for Dialogue -----line # case corrected Dialogue_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])), :]) print(Dialogue_case_corrected_count) # indentatioin corrected dialogue_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) 
dialogue_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_line_indentation = dialogue_left_indent_corrected_count + dialogue_right_indent_corrected_count + dialogue_wrapped_lines_count print("dialogue_line_indentation:",dialogue_line_indentation) # formate corrected dialogue_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) dialogue_formated = dialogue_formate1 + dialogue_formate2 + dialogue_formate3 + dialogue_formate4 + dialogue_formate5 + dialogue_formate6 print("dialogue_formated",dialogue_formated) # total number of dialogue total_no_dialogue = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['Identification_Status'].isin(['ps13', 'ps14', 'ps15'])),:]) print(total_no_dialogue) # 
for Transistion -----line # case corrected transitions_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps16'])), :]) print(transitions_case_corrected_count) # indentatioin corrected transitions_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_line_indentation = transitions_left_indent_corrected_count + transitions_right_indent_corrected_count + transitions_wrapped_lines_count print("transitions_line_indentation:",transitions_line_indentation) # formate corrected transitions_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& 
(audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps16'])),:]) transitions_formated = transitions_formate1 + transitions_formate2 + transitions_formate3 + transitions_formate4 + transitions_formate5 + transitions_formate6 print("transitions_formated",transitions_formated) #total transition total_no_transition = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps16']),:]) print(total_no_transition) # for Spectial Terms -----line # case corrected st_case_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['case_corrected'] != 'No') & (audit_df['Identification_Status'].isin(['ps17'])), :]) print("st_case_corrected_count",st_case_corrected_count) # indentatioin corrected st_left_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['left_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_right_indent_corrected_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['right_indent_corrected'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_wrapped_lines_count = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_wrapped_at_prescribed_right_indent'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_line_indentation = st_left_indent_corrected_count + st_right_indent_corrected_count + st_wrapped_lines_count print("st_line_indentation:",st_line_indentation) # formate corrected st_formate1 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_removed_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_formate2 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['space_added_between_characters'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) 
st_formate3 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_merged_with_next_line'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_formate4 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['line_broken_into_multiple_lines'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_formate5 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_added'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_formate6 = len(audit_df.loc[(audit_df['line_removed'] == 'No') & (audit_df['punctuation_mark_removed'] != 'No')& (audit_df['Identification_Status'].isin(['ps17'])),:]) st_formated = st_formate1 + st_formate2 + st_formate3 + st_formate4 + st_formate5 + st_formate6 print("st_formated",st_formated) #total numner of special terms total_special_terms = len(audit_df.loc[audit_df['Identification_Status'].isin(['ps17']),:]) if total_special_terms < 1 : total_special_terms = 1 print(total_special_terms) # write logic for the percentage #a difference_of_page_no = int(pre_audit_pagenumber) - int(postauditpagenumber) average_of_page_no = (int(pre_audit_pagenumber) + int(postauditpagenumber)) / 2 final_ratio_pageno = (difference_of_page_no / average_of_page_no) * 100 #b difference_of_line_no = int(preaudit_line_no)- int(postaudit_line_no) average_of_line_no = (int(preaudit_line_no) + int(postaudit_line_no)) / 2 final_ratio_lineno = (difference_of_line_no / average_of_line_no) * 100 #c try: ratio_for_blanklines = ((int(blankline_inserted) + int(blankline_removed_total)) / average_of_line_no) *100 except: ratio_for_blanklines = 0 #j try: ratio_for_sluglines = ((int(sluglinecase_corrected_count)+int(slugline_indentation)+int(slugline_formated))/total_no_sluglines)*100 except: ratio_for_sluglines = 0 #d try: ratio_for_actionlines = ((int(actionlinecase_corrected_count)+ int(actionline_indentation)+ int(total_actionlines))/total_actionlines)*100 except: 
ratio_for_actionlines = 0 #e try: ratio_for_Speaker = ((int(speakercase_corrected_count)+int(speaker_formated)+int(speaker_formated))/ total_actionlines)*100 except: ratio_for_Speaker = 0 #f try: ratio_for_parenthetical = ((int(parentheticalcase_corrected_count)+int(parenthetical_line_indentation)+int(parenthetical_formated)) / total_no_parenthetical)*100 except: ratio_for_parenthetical = 0 #g try: ratio_for_dialogues = ((int(Dialogue_case_corrected_count)+int(dialogue_line_indentation)+int(dialogue_formated)) / total_no_dialogue)*100 except: ratio_for_dialogues = 0 #h try: ratio_for_transitions = ((int(transitions_case_corrected_count)+int(transitions_line_indentation)+int(transitions_formated)) / total_no_transition)*100 except: ratio_for_transitions = 0 #i try: ratio_for_special_terms = ((int(st_case_corrected_count)+int(st_line_indentation)+int(st_formated))/total_special_terms) * 100 except: ratio_for_special_terms = 0 average_of_c_j = (ratio_for_sluglines+ratio_for_actionlines+ratio_for_Speaker+ratio_for_parenthetical+ratio_for_dialogues+ratio_for_transitions+ratio_for_special_terms)/7 audit_configuration_percentage = (final_ratio_pageno+final_ratio_lineno+ratio_for_blanklines) + (average_of_c_j) audit_configuration_percentage_str = f"{audit_configuration_percentage:.2f}%" print("audit_configuration_percentage",audit_configuration_percentage_str) total_script_element_correct = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms) print("total_script_element_correct",total_script_element_correct) audit_script_accuracy = (total_no_sluglines+total_actionlines+total_no_speaker+total_no_parenthetical+total_no_dialogue+total_no_transition+total_special_terms+total_no_blanklines)/preaudit_line_no print("audit_script_accuracy",audit_script_accuracy) # audit_script_accuracy_str = min(audit_script_accuracy*100 , 100) audit_script_accuracy_str = audit_script_accuracy*100 
audit_script_accuracy_str = f"{audit_script_accuracy_str:.2f}%" print("audit_script_accuracy_str",audit_script_accuracy_str) # the table logics ends here # percenteage table from here output_doc = Document() style = output_doc.styles['Normal'] font = style.font #font.name = 'Courier New' font.size = Pt(10) section = output_doc.sections[-1] section.orientation = WD_ORIENT.LANDSCAPE section.page_width = Inches(11) section.left_margin = Inches(0.25) section.right_margin = Inches(0.25) para = output_doc.add_paragraph() para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Audit Summary at center of the page with bold run = para.add_run() font = run.font font.bold = True font.size = Pt(18) run.text = ' Audit Summary' para = output_doc.add_paragraph() # run.add_break() # Add a paragraph for the left-aligned "Audit Date" current_date = date.today() # Convert to the "day month year" format formatted_date = current_date.strftime("%d %B %Y") string_date = "š€š®šš¢š­ šƒššš­šž" left_aligned_text = output_doc.add_paragraph("\t\t\t\t\t\t\t\t\t\t\t\t\t\t"+ string_date+ " : " + str(formatted_date)) left_aligned_text.alignment = WD_ALIGN_PARAGRAPH.LEFT font_audit_date = left_aligned_text.runs[0].font font_audit_date.size = Pt(14) font.bold = True para = output_doc.add_paragraph() table = output_doc.add_table(rows=2, cols=2) table.alignment = WD_TABLE_ALIGNMENT.CENTER table.style = 'Colorful Shading Accent 6' table.autofit = False # Turn off autofit to set cell widths explicitly # Set cell widths (you can adjust these values as needed) table.columns[0].width = Pt(150) table.columns[1].width = Pt(100) # Access the first cell in the first row cell = table.cell(0, 0) cell.text = "Audit Contributions" cell1 = table.cell(0, 1) cell1.text = audit_configuration_percentage_str for paragraph in cell.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) for paragraph in cell1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = 
WD_ALIGN_PARAGRAPH.CENTER # Set vertical alignment to top cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER # Access the first cell in the second row cell = table.cell(1, 0) cell.text = "Audited Accuracy" cell1 = table.cell(1,1) cell1.text = audit_script_accuracy_str for paragraph in cell.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) for paragraph in cell1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER # Set vertical alignment to top cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER para = output_doc.add_paragraph() para = output_doc.add_paragraph() para = output_doc.add_paragraph() right_aligned_text = para.add_run('\t\tš’šœš«š¢š©š­ šššš¦šž: ' + str(scriptname)) right_aligned_text.alignment = WD_ALIGN_PARAGRAPH.RIGHT font_right = right_aligned_text.font font_right.size = Pt(14) author_para = output_doc.add_paragraph() run_author = author_para.add_run("\t\tš€š®ļæ½ļ潚”šØš«: " + str(author)) font_author = run_author.font font_author.size = Pt(14) language_script_para = output_doc.add_paragraph() run_language_script = language_script_para.add_run("\t\tš‹ššš§š š®ššš šž šØšŸ š’šœš«š¢š©š­: " + str(script_language)) font_language_script = run_language_script.font font_language_script.size = Pt(14) language_dialogue_para = output_doc.add_paragraph() run_language_dialogue = language_dialogue_para.add_run("\t\tš‹ššš§š š®ššš šž šØšŸ šƒš¢ššš„šØš š®šž: " + str(dialogue_language)) font_language_dialogue = run_language_dialogue.font font_language_dialogue.size = Pt(14) # Remove line spacing for the entire document for para in output_doc.paragraphs: para.paragraph_format.space_before = Pt(1) para.paragraph_format.space_after = Pt(1) para = output_doc.add_paragraph() para = output_doc.add_paragraph() # changes_string_line = output_doc.add_paragraph() # run_changes_string_line = 
changes_string_line.add_run("\t\tStructural Changes\t\t\t\t\\t Blank Lines Adjustments ") # font_changes_string_line = run_changes_string_line.font # # Set font properties # font_changes_string_line.color.rgb = WD_COLOR_INDEX.RED # Red font color # font_changes_string_line.italic = True # Italic style # font_changes_string_line.bold = True paragraph = output_doc.add_paragraph() paragraph = output_doc.add_paragraph() run = paragraph.add_run("\t\t\t\tStructural Changes\t\t\t\t\t\tBlank Lines Adjustments ") # Set font size font = run.font font.size = Pt(14) # Set font color to red font.color.rgb = RGBColor(255, 0, 0) font.bold = True font.italic = True for para in output_doc.paragraphs: para.paragraph_format.space_before = Pt(0) para.paragraph_format.space_after = Pt(0) table = output_doc.add_table(rows=1, cols=2) table.allow_autofit = False table.alignment = WD_TABLE_ALIGNMENT.CENTER table._cells[0].width = Inches(4.3) table._cells[1].width = Inches(4.3) column_first = table._cells[0].add_table(rows=3, cols=3) column_second = table._cells[1].add_table(rows=2, cols=2) column_first.style = 'Colorful Shading Accent 6' column_second.style = 'Colorful Shading Accent 6' column_first_row1 = column_first.cell(0,1) column_first_row1.text ="Pre Audit" column_first_row1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_first_row1_c2 = column_first.cell(0,2) column_first_row1_c2.text ="Post Audit" column_first_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row1_c2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_first_row2_c1 = column_first.cell(1,0) column_first_row2_c1.text ="No of Pages" column_first_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row2_c1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_first_row2_c2 = 
column_first.cell(1,1) column_first_row2_c2.text = str(pre_audit_pagenumber) column_first_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row2_c2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER column_first_row2_c3 = column_first.cell(1,2) column_first_row2_c3.text = str(postauditpagenumber) column_first_row2_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row2_c3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER column_first_row3_c1 = column_first.cell(2,0) column_first_row3_c1.text = "No of Lines" column_first_row3_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row3_c1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_first_row3_c2 = column_first.cell(2,1) column_first_row3_c2.text = str(preaudit_line_no) column_first_row3_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row3_c2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER column_first_row3_c3 = column_first.cell(2,2) column_first_row3_c3.text = str(postaudit_line_no) column_first_row3_c3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_first_row3_c3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER column_second_row1_c1 = column_second.cell(0,0) column_second_row1_c1.text = "Blank Lines Added" column_second_row1_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_second_row1_c1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_second_row1_c2 = column_second.cell(0,1) column_second_row1_c2.text = str(blankline_inserted) column_second_row1_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in 
column_second_row1_c2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER column_second_row2_c1 = column_second.cell(1,0) column_second_row2_c1.text = "Blank Lines Removed" column_second_row2_c1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_second_row2_c1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) column_second_row2_c2 = column_second.cell(1,1) column_second_row2_c2.text = str(blankline_removed_total) column_second_row2_c2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in column_second_row2_c2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER paragraph = output_doc.add_paragraph() paragraph = output_doc.add_paragraph() paragraph = output_doc.add_paragraph() run = paragraph.add_run("Summary of Correction made") # Set font size font = run.font font.size = Pt(14) # Set font color to red font.color.rgb = RGBColor(255, 0, 0) font.bold = True font.italic = True paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table = output_doc.add_table(rows=9, cols=5) summary_table.allow_autofit = False summary_table.alignment = WD_TABLE_ALIGNMENT.CENTER summary_table.style = 'Colorful Shading Accent 6' # # Calculate the column widths # column_widths = [Inches(1), Inches(1.5), Inches(1.5), Inches(1.5), Inches(1)] # Adjust the widths as needed # # Set the column widths # for col, width in enumerate(column_widths): # summary_table.columns[col].width = width # table.columns[0].width = Pt(150) # table.columns[1].width = Pt(100) # summary_table.columns[0].width = Inches(1) # summary_table.columns[1].width = Inches(1) # summary_table.columns[2].width = Inches(1) # summary_table.columns[3].width = Inches(1) # summary_table.columns[4].width = Inches(0.5) summary_table_row1_col_2 = summary_table.cell(0,1) summary_table_row1_col_2.text ="Case Correction" summary_table_row1_col_2.vertical_alignment = 
WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row1_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row1_col_3 = summary_table.cell(0,2) summary_table_row1_col_3.text ="Indent Correction" summary_table_row1_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row1_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row1_col_4 = summary_table.cell(0,3) summary_table_row1_col_4.text ="Format Correction" summary_table_row1_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row1_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row1_col_5 = summary_table.cell(0,4) summary_table_row1_col_5.text ="Total" summary_table_row1_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER summary_table_row1_col_5.width = Inches(0.5) for paragraph in summary_table_row1_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) run.font.bold = True paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #---------ROW 2------------ summary_table_row2_col_1 = summary_table.cell(1,0) summary_table_row2_col_1.text ="Sluglines" summary_table_row2_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row2_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row2_col_2 = summary_table.cell(1,1) summary_table_row2_col_2.text = str(sluglinecase_corrected_count) summary_table_row2_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row2_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row2_col_3 = summary_table.cell(1,2) summary_table_row2_col_3.text = 
str(slugline_indentation) summary_table_row2_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row2_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row2_col_4 = summary_table.cell(1,3) summary_table_row2_col_4.text = str(slugline_formated) summary_table_row2_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row2_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row2_col_5 = summary_table.cell(1,4) total_slug = slugline_formated+slugline_indentation+sluglinecase_corrected_count summary_table_row2_col_5.text = str(total_slug) summary_table_row2_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row2_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row3_col_1 = summary_table.cell(2,0) summary_table_row3_col_1.text = "Action Lines" summary_table_row3_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row3_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row3_col_2 = summary_table.cell(2,1) summary_table_row3_col_2.text = str(actionlinecase_corrected_count) summary_table_row3_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row3_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row3_col_3 = summary_table.cell(2,2) summary_table_row3_col_3.text = str(actionline_indentation) summary_table_row3_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row3_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER 
summary_table_row3_col_4 = summary_table.cell(2,3) summary_table_row3_col_4.text = str(actionline_formated) summary_table_row3_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row3_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row3_col_5 = summary_table.cell(2,4) total_action_line = actionlinecase_corrected_count+actionline_indentation+actionline_formated summary_table_row3_col_5.text = str(total_action_line) summary_table_row3_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row3_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #-----ROW 4 --------- summary_table_row4_col_1 = summary_table.cell(3,0) summary_table_row4_col_1.text = "Speaker" summary_table_row4_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row4_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) # paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row4_col_2 = summary_table.cell(3,1) summary_table_row4_col_2.text = str(speakercase_corrected_count) summary_table_row4_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row4_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row4_col_3 = summary_table.cell(3,2) summary_table_row4_col_3.text = str(speaker_indentation) summary_table_row4_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row4_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row4_col_4 = summary_table.cell(3,3) summary_table_row4_col_4.text = str(speaker_formated) summary_table_row4_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for 
paragraph in summary_table_row4_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row4_col_5 = summary_table.cell(3,4) total_speaker = speaker_formated+speaker_indentation+speakercase_corrected_count summary_table_row4_col_5.text = str(total_speaker) summary_table_row4_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row4_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER # ----ROW 5 ------- summary_table_row5_col_1 = summary_table.cell(4,0) summary_table_row5_col_1.text = "Parentheticals" summary_table_row5_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row5_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row5_col_2 = summary_table.cell(4,1) summary_table_row5_col_2.text = str(parentheticalcase_corrected_count) summary_table_row5_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row5_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row5_col_3 = summary_table.cell(4,2) summary_table_row5_col_3.text = str(parenthetical_line_indentation) summary_table_row5_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row5_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row5_col_4 = summary_table.cell(4,3) summary_table_row5_col_4.text = str(parenthetical_formated) summary_table_row5_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row5_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row5_col_5 = summary_table.cell(4,4) total_parenthetical = 
parenthetical_formated + parenthetical_line_indentation+parentheticalcase_corrected_count summary_table_row5_col_5.text = str(total_parenthetical) summary_table_row5_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row5_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #ROW --- 6 summary_table_row6_col_1 = summary_table.cell(5,0) summary_table_row6_col_1.text = "Dialogue" summary_table_row6_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row6_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row6_col_2 = summary_table.cell(5,1) summary_table_row6_col_2.text = str(Dialogue_case_corrected_count) summary_table_row6_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row6_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row6_col_3 = summary_table.cell(5,2) summary_table_row6_col_3.text = str(dialogue_line_indentation) summary_table_row6_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row6_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row6_col_4 = summary_table.cell(5,3) summary_table_row6_col_4.text = str(dialogue_formated) summary_table_row6_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row6_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row6_col_5 = summary_table.cell(5,4) total_dialogue = dialogue_formated + dialogue_line_indentation+Dialogue_case_corrected_count summary_table_row6_col_5.text = str(total_dialogue) summary_table_row6_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in 
summary_table_row6_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #ROW --- 7 summary_table_row7_col_1 = summary_table.cell(6,0) summary_table_row7_col_1.text = "Transitions" summary_table_row7_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row7_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row7_col_2 = summary_table.cell(6,1) summary_table_row7_col_2.text = str(transitions_case_corrected_count) summary_table_row7_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row7_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row7_col_3 = summary_table.cell(6,2) summary_table_row7_col_3.text = str(transitions_line_indentation) summary_table_row7_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row7_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row7_col_4 = summary_table.cell(6,3) summary_table_row7_col_4.text = str(transitions_formated) summary_table_row7_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row7_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row7_col_5 = summary_table.cell(6,4) total_transition = transitions_formated+transitions_line_indentation+transitions_case_corrected_count summary_table_row7_col_5.text = str(total_transition) summary_table_row7_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row7_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #ROW --- 8 summary_table_row8_col_1 = summary_table.cell(7,0) 
summary_table_row8_col_1.text = "Special Terms" summary_table_row8_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row8_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) summary_table_row8_col_2 = summary_table.cell(7,1) summary_table_row8_col_2.text = str(st_case_corrected_count) summary_table_row8_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row8_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row8_col_3 = summary_table.cell(7,2) summary_table_row8_col_3.text = str(st_line_indentation) summary_table_row8_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row8_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row8_col_4 = summary_table.cell(7,3) summary_table_row8_col_4.text = str(st_formated) summary_table_row8_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row8_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row8_col_5 = summary_table.cell(7,4) total_special_term = st_formated +st_line_indentation+ st_case_corrected_count summary_table_row8_col_5.text = str(total_special_term) summary_table_row8_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row8_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #ROW --- 9 summary_table_row9_col_1 = summary_table.cell(8,0) summary_table_row9_col_1.text = "Total" summary_table_row9_col_1.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row9_col_1.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) run.font.bold = True summary_table_row9_col_2 = 
summary_table.cell(8,1) summary_table_row9_col_2.text = str(sluglinecase_corrected_count+actionlinecase_corrected_count+speakercase_corrected_count+parentheticalcase_corrected_count +Dialogue_case_corrected_count+transitions_case_corrected_count+st_case_corrected_count) summary_table_row9_col_2.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row9_col_2.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row9_col_3 = summary_table.cell(8,2) summary_table_row9_col_3.text = str(slugline_indentation+actionline_indentation+speaker_indentation+parenthetical_line_indentation+dialogue_line_indentation+transitions_line_indentation+st_line_indentation) summary_table_row9_col_3.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row9_col_3.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row9_col_4 = summary_table.cell(8,3) summary_table_row9_col_4.text = str(slugline_formated+actionline_formated+speaker_formated+parenthetical_formated+dialogue_formated+transitions_formated+st_formated) summary_table_row9_col_4.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row9_col_4.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER summary_table_row9_col_5 = summary_table.cell(8,4) summary_table_row9_col_5.text = str(total_slug+total_action_line+total_speaker+total_parenthetical+total_dialogue+total_transition+total_special_term) summary_table_row9_col_5.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER for paragraph in summary_table_row9_col_5.paragraphs: for run in paragraph.runs: run.font.size = Pt(14) paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER #--------------------------- 14-09-2023 for _ in range(3): output_doc.add_paragraph() #----------------------- 14-09-23 
para = output_doc.add_paragraph() para.alignment = WD_ALIGN_PARAGRAPH.CENTER # Audit detail at center of the page with bold run = para.add_run() font = run.font font.bold = True font.size = Pt(18) run.text = ' Audit Details' run.add_break() # -------------------------- 14-09-23 no_rows = len(audit_df.index) table =output_doc.add_table(1, cols =6) table.alignment = WD_TABLE_ALIGNMENT.CENTER table.style = 'Colorful Shading Accent 6' table.autofit = False table.columns[0].width = Inches(0.5) table.columns[1].width = Inches(1.2) table.columns[2].width = Inches(2) table.columns[3].width = Inches(1.5) table.columns[4].width = Inches(2) table.columns[5].width = Inches(2.5) # table.columns[3].width = Inches(0.5) heading_cells = table.rows[0].cells heading_cells[0].width = Inches(0.1) heading_cells[1].width = Inches(0.1) heading_cells[2].width = Inches(3.5) heading_cells[3].width = Inches(0.8) heading_cells[4].width = Inches(3.5) heading_cells[5].width = Inches(2) heading_cells[0].text = 'Line No' heading_cells[1].text = 'Audited Line No' heading_cells[2].text = 'Current Content' heading_cells[3].text = 'Script Element' heading_cells[4].text = 'New Content' heading_cells[5].text = 'Changes Done' for i in range(0,6): heading_cells[i].paragraphs[0].runs[0].font.bold = True heading_cells[i].paragraphs[0].runs[0].font.size = Pt(9) #------------------------------->LOGIC HERE<--------------------------------------------- report_df = pd.DataFrame(columns=['line_no', 'audited_line_no', 'current_content', 'script_element', 'new_content', 'changes_done', 'para_no']) for index in audit_df.index: columns_to_check = ["line_removed","introduction", "appendix", "page_no" ,"left_indent_corrected" ,"right_indent_corrected" ,"line_wrapped_at_prescribed_right_indent", "case_corrected", "blank_inserted_before" ,"blank_inserted_after" ,"blank_deleted_before" ,"blank_deleted_after" ,"space_removed_between_characters" ,"space_added_between_characters" ,"line_merged_with_next_line", 
"line_broken_into_multiple_lines" ,"punctuation_mark_added" ,"punctuation_mark_removed" ,"language_specific_audit_comments"] audit_df[columns_to_check] = audit_df[columns_to_check].fillna('No') if audit_df.loc[index, columns_to_check].eq('No').all().all(): # All columns contain 'No', skip this row continue elif audit_df['introduction'][index] == 'Yes': continue elif audit_df['appendix'][index] == 'Yes': continue elif audit_df['Identification_Status'][index] == 'blank': continue elif pd.isna(audit_df.loc[index, "Identification_Status"]): continue para_value = audit_df["para_no"][index] # ---------------------------------------------><------------------------- current_para_value = report_df['para_no'].iloc[-1] if not report_df.empty else None if para_value == current_para_value: continue else: # report_df = report_df.append(audit_df.loc[index], ignore_index=True) new_row = audit_df.loc[index].to_frame().T report_df = pd.concat([report_df, new_row], ignore_index=True) print("current_para_value",current_para_value) row_index = 1 old_line_no_index = index collection_old_line_no = [] while old_line_no_index < len(audit_df) and str(audit_df["para_no"][old_line_no_index]) == str(para_value): if audit_df['Identification_Status'][old_line_no_index] != "blank": try: data = int(old_line_no_index) collection_old_line_no.append(str(data)) except ValueError: pass old_line_no_index += 1 cells = table.add_row().cells cells[0].width = Inches(0.1) cells[0].text = ', '.join(collection_old_line_no) audited_line_index = index #--------------------------------------audited_lino_no------------------ collection_audited_line_no = [] while audited_line_index < len(audit_df) and str(audit_df["para_no"][audited_line_index]) == str(para_value): if audit_df['Identification_Status'][audited_line_index] != "blank": audited_line_no = audit_df['audited_line_no'][audited_line_index] try: data = int(audited_line_no) collection_audited_line_no.append(str(data)) except ValueError: pass 
audited_line_index += 1 print("collection_audited_line_no", collection_audited_line_no) data_string = ', '.join(collection_audited_line_no) print("data_string:", data_string) cells[1].width = Inches(0.1) cells[1].text = data_string #------------------------------>OLD DATA<--------------------------------- data_index = index collection_data = [] while data_index < len(audit_df) and str(audit_df["para_no"][data_index]) == str(para_value): cur_data = audit_df['data'][data_index] if not pd.isna(cur_data): # Check if the value is not NaN data = str(cur_data).strip() collection_data.append(data) data_index += 1 cells[2].width = Inches(3.5) data = str(data) cells[2].text = '\n '.join(collection_data) if audit_df['Identification_Status'][index] == 'blank': script_element = 'Blank Line' elif audit_df['Identification_Status'][index] == '': if audit_df['introduction'][index] == 'Yes': script_element = 'Title/Introduction' elif audit_df['appendix'][index] == 'Yes': script_element = 'Appendix' else: continue else: script_element = ps_to_script_element(audit_df['Identification_Status'][index]) data = script_element cells[3].width = Inches(0.8) cells[3].text = data collection_new_data = [] new_data_index = index while new_data_index < len(audit_df) and str(audit_df["para_no"][new_data_index]) == str(para_value): if audit_df["line_removed"][new_data_index] == "No": new_data = audit_df['data_corrected'][new_data_index] if not pd.isna(new_data): # Check if the value is not NaN data = str(new_data).strip() collection_new_data.append(data) new_data_index += 1 data = str(new_data).strip() cells[4].width = Inches(3.5) data = str(data) cells[4].text = '\n '.join(collection_new_data) sno = 1 changes_done = False # identification_status = audit_df['Identification_Status'][index] if pd.isnull(audit_df['Identification_Status'][index]) or audit_df['Identification_Status'][index] == "": continue if audit_df['left_indent_corrected'][index] != 'No': change_comment = 
audit_df['left_indent_corrected'][index] try: str_int = change_comment[-2]+change_comment[-1] except Exception as e: pass if ps_to_script_element(audit_df['Identification_Status'][index]) == "Dialogue": if str_int == "15": change_comment = "Dialogue line left index corrected to 1.5 Inch" elif str_int == "25": change_comment = "Dialogue line left index corrected to 2.5 Inch" if str_int == "15": name = ps_to_script_element(audit_df['Identification_Status'][index]) change_comment = f"{name} Line left indent corrected to 1.5 Inch" print(change_comment) elif str_int == "25": name = ps_to_script_element(audit_df['Identification_Status'][index]) change_commen = f"{name} Left indent corrected to 2.5 Inch" elif str_int == "30": change_comment = "Parenthetical left indent corrected to 3 Inch" elif str_int == "35": change_comment = "Speaker left indent corrected to 3.5 Inch" data = str(sno) + '. ' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['right_indent_corrected'][index] != 'No': name = ps_to_script_element(audit_df['Identification_Status'][index]) change_comment = audit_df['right_indent_corrected'][index] try: str_int = change_comment[-2]+change_comment[-1] except Exception as e: pass if str_int == "10": change_comment = f"{name} Line right indent corrected to 1 Inch" data = str(sno) + '. 
' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['case_corrected'][index] != 'No': name = ps_to_script_element(audit_df['Identification_Status'][index]) string = str(audit_df['case_corrected'][index]) string = string.split() content = string[-1] if content == "AllUpper": change_comment = f'{name} Case ' + "Corrected to All Upper" elif content == "AllLower": change_comment = f'{name} Case ' + "Corrected to All Lowerr" if len(str(change_comment)) <= 2 : continue data = str(sno) + '. ' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['line_wrapped_at_prescribed_right_indent'][index] != 'No': change_comment = 'Line Wrapped at Prescribed Right Indent 1 Inch' name = ps_to_script_element(audit_df['Identification_Status'][index]) if name == "Action": change_comment = f'{name} Line Wrapped at Prescribed Right Indent 1 Inch' elif name == "Dialogue": change_comment = f'{name} Line Wrapped at Prescribed Right Indent 2 Inch' data = str(sno) + '. ' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['line_broken_into_multiple_lines'][index] != 'No': name = ps_to_script_element(audit_df['Identification_Status'][index]) change_comment = f'{name} line Broken into Multiple Lines' data = str(sno) + '. 
' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['line_merged_with_next_line'][index] != 'No': name = ps_to_script_element(audit_df['Identification_Status'][index]) change_comment = f'{name} line Merged with Next Line' data = str(sno) + '. ' + str(change_comment) # dataa = data.split() # if dataa[-1] == "nan": # continue cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['language_specific_audit_comments'][index] != 'No': pass name = ps_to_script_element(audit_df['Identification_Status'][index]) change_comment = f"{name}",str(audit_df['language_specific_audit_comments'][index]) data = str(sno) + '. ' + str(change_comment) cells[5].width = Inches(2) para = cells[5].add_paragraph() run = para.add_run() run.text = data run.add_break() sno += 1 changes_done = True if audit_df['blank_inserted_after'][index] != 'No': change_comment = 'A blank line is added below' # name = ps_to_script_element(audit_df['Identification_Status'][index]) # if name == "Action": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 1 Inch' # elif name == "Dialogue": # change_comment = f'{name}Line Wrapped at Prescribed Right Indent 2 Inch' data = str(sno) + '. 
# Shape recognised inside a slugline: an optional INT./EXT.-style prefix, a
# location, a hyphen separator and a run of upper-case letters / whitespace.
# Compiled once at import time instead of being looked up on every call.
_SLUGLINE_LOCATION_RE = re.compile(
    r'((?:INT\./EXT\. |INT\. |EXT\. |E/I\. |INT |EXT)?)\s*(.*?)\s*-\s*([A-Z\s]+)'
)


def replace_dot_with_comma(slugline):
    """Replace the dots inside the location part of a slugline with commas.

    Every ``<prefix><location> - <UPPER CASE TEXT>`` match is rewritten with
    the dots of ``<location>`` turned into commas and the separator written
    as ``' - '``.  Dots that belong to the prefix (``INT.``, ``EXT.`` ...)
    are kept.  Text that contains no such match is returned unchanged.

    Args:
        slugline: The slugline text to process.

    Returns:
        The slugline with the location dots replaced by commas.

    Example:
        >>> replace_dot_with_comma("INT. MR. SMITH'S HOUSE - DAY")
        "INT. MR, SMITH'S HOUSE - DAY"
    """
    def replacer(match):
        prefix = match.group(1)
        location = match.group(2).replace(".", ",")
        tail = match.group(3)
        # Group 3 is greedy over whitespace, so in "A - B - C" the first match
        # already emits "B " (with its trailing space) and the next match
        # starts directly on the second hyphen with an empty prefix and
        # location.  Emitting ' - ' again there produced "B  - C"; skip the
        # leading space in exactly that situation.
        continues_previous_match = (
            not prefix
            and not location
            and match.start() > 0
            and match.string[match.start() - 1].isspace()
        )
        separator = '- ' if continues_previous_match else ' - '
        return f'{prefix}{location}{separator}{tail}'

    return _SLUGLINE_LOCATION_RE.sub(replacer, slugline)


def change_dot_to_comma_inslug(df):
    """Apply ``replace_dot_with_comma`` to every slugline row of ``df``.

    Rows whose ``script_element`` is ``'slugline'`` get their ``content``
    rewritten in place; all other rows are left untouched.  Slugline rows
    whose content is not a string (for example NaN) are skipped instead of
    raising ``TypeError`` inside the regex call.

    Args:
        df: DataFrame with ``script_element`` and ``content`` columns.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: If one of the two required columns is missing.
    """
    content_column = df.columns.get_loc('content')
    # Snapshot both columns first so the loop never reads from data it is
    # writing to; positions (not labels) are used so that a non-unique index
    # cannot make one write hit several rows.
    rows = zip(df['script_element'].tolist(), df['content'].tolist())
    for position, (element, text) in enumerate(rows):
        if element == 'slugline' and isinstance(text, str):
            df.iat[position, content_column] = replace_dot_with_comma(text)
    return df


def fdx_to_audited_df(input_script):
    """Build the audited paragraph table from a Final Draft (.fdx) script.

    The file is parsed with ``utilities.fdx_to_csv``; its ``Text`` and
    ``Script_Element`` columns become ``content`` and ``script_element``.
    Final Draft element names are mapped to the internal names, blank rows
    are inserted, letter case is normalised and every row is numbered.

    Rules applied, in script order:
      * slugline, speaker and transition content is upper-cased;
        parenthetical content is lower-cased;
      * a ``'blank'`` row (empty content) is added after every slugline,
        action, dialogue and transition that is not the last row, except
        after a dialogue that is directly followed by a parenthetical or
        another dialogue;
      * ``para_no`` is the 1-based row number, blank rows included;
      * ``scene_no`` starts at 1 and is increased after each slugline row.

    Args:
        input_script: Path of the .fdx file to read.

    Returns:
        DataFrame with the columns ``para_no``, ``scene_no``, ``content``
        and ``script_element``.
    """
    # The handle is only needed while the parser runs; the context manager
    # closes it even when parsing raises (it used to be left open).
    with open(input_script, 'r') as fdx:
        fdx_df = utilities.fdx_to_csv(fdx)

    element_names = {
        'Scene Heading': 'slugline',
        'Character': 'speaker',
        'Parenthetical': 'parenthetical',
        'Transition': 'transition',
        'Action': 'action',
        'Dialogue': 'dialogue',
    }
    # Names without a mapping are passed through unchanged.
    elements = [element_names.get(name, name) for name in fdx_df['Script_Element']]
    contents = fdx_df['Text'].tolist()

    blank_after = ('slugline', 'action', 'dialogue', 'transition')
    upper_cased = ('slugline', 'speaker', 'transition')
    last_position = len(elements) - 1

    # Rows are collected in a list and turned into a frame once; growing the
    # frame row by row and writing through chained indexing
    # (df['para_no'][index] = ...) is slow and is not guaranteed to write
    # back into the frame.
    records = []
    scene_no = 1
    for position, (element, content) in enumerate(zip(elements, contents)):
        # The original author notes that a more elaborate slugline routine
        # exists in sa_functions_english and could be repurposed here.
        if element in upper_cased:
            content = str(content).upper()
        elif element == 'parenthetical':
            content = str(content).lower()

        records.append((len(records) + 1, scene_no, content, element))

        # NOTE(review): the slugline row itself keeps the number that was
        # current before it; the increase applies from the next row on.
        # Kept as in the original - confirm this numbering is intended.
        if element == 'slugline':
            scene_no += 1

        if position < last_position and element in blank_after:
            # Keep dialogue / parenthetical runs together: no blank between
            # a dialogue and a directly following parenthetical or dialogue.
            speech_continues = (
                element == 'dialogue'
                and elements[position + 1] in ('parenthetical', 'dialogue')
            )
            if not speech_continues:
                records.append((len(records) + 1, scene_no, '', 'blank'))

    return pd.DataFrame(
        records,
        columns=['para_no', 'scene_no', 'content', 'script_element'],
    )