import re

# NOTE(review): this module was reconstructed from a whitespace-collapsed
# source, so block nesting was inferred from syntax.  All DataFrame writes use
# ``.at`` / ``.iloc`` on the frame itself instead of chained indexing
# (``df[col][idx] = value``), because chained assignment does not update the
# original frame when pandas Copy-on-Write is active.

# Column of ``audit_df`` that accumulates language-specific audit remarks.
_AUDIT_COMMENT_COL = 'language_specific_audit_comments'

# An intro is only trimmed when the last intro row label is at most this value.
_MAX_INTRO_LABEL = 100

# A closing stop word is ignored when fewer than this many rows precede it.
_MIN_ROWS_BEFORE_APPENDIX = 5

_INTRO_STOPWORDS = ('FADE IN',)
_FIRST_SLUG_WORDS = ('INT', 'EXT', 'I/E', 'E/I')
_APPENDIX_STOPWORDS = ('FADE OUT', 'THE END', 'BLACK')

_SLUG_PHRASES = ('INT', 'EXT', 'DAY', 'NIGHT')
_SPEAKER_TERMS = (
    'voice from mobile',
    'voice from phone',
    'voice from tv',
    'voice from radio',
)
_SPEAKER_PAREN_TERMS = ('continued', 'cont.d', 'cont')
_SPEAKER_EXT_PAREN_TERMS = ('VO', 'OS', r'V\.O', r'O\.S')

_VALID_SLUG_STARTS = ('INT. ', 'EXT. ', 'INT./EXT. ', 'EXT./INT. ')
_VALID_SLUG_TIMES = ('DAY', 'NIGHT', 'EVENING')
_SLUG_INDENT = 15

# (pattern, replacement) pairs tried in order against the start of a slugline.
# The combined ``INT./EXT.`` forms are listed BEFORE the single ``INT.`` /
# ``EXT.`` forms; otherwise "INT./EXT.ROOM" is rewritten to "INT. /EXT.ROOM".
_SLUG_START_FIXES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r'INT. ', 'INT. '),
        (r'EXT. ', 'EXT. '),
        (r'INT-', 'INT.'),
        (r'EXT-', 'EXT.'),
        (r'INT\/EXT-', 'INT./EXT.'),
        (r'EXT\/INT-', 'EXT./INT.'),
        (r'INT -', 'INT.'),
        (r'EXT -', 'EXT.'),
        (r'INT\/EXT -', 'INT./EXT.'),
        (r'EXT\/INT -', 'EXT./INT.'),
        (r'INT ', 'INT.'),
        (r'EXT ', 'EXT.'),
        (r'INT\/EXT ', 'INT./EXT.'),
        (r'EXT\/INT ', 'EXT./INT.'),
        (r'INT,', 'INT.'),
        (r'EXT,', 'EXT.'),
        (r'INT\/EXT,', 'INT./EXT.'),
        (r'EXT\/INT,', 'EXT./INT.'),
        (r'INT\/EXT\.', 'INT./EXT.'),
        (r'EXT\/INT\.', 'EXT./INT.'),
        (r'NT\.', 'INT.'),
        (r'XT\.', 'EXT.'),
        (r'INT\/ EXT ', 'INT./EXT. '),
        (r'EXT\/ INT ', 'EXT./INT. '),
        (r'INT \/ EXT ', 'INT./EXT. '),
        (r'EXT \/ INT ', 'EXT./INT. '),
        (r'I \s*T', 'INT.'),
        (r'E \s*XT', 'EXT.'),
        (r'INT\.\/EXT\.', 'INT./EXT. '),
        (r'EXT\.\/INT\.', 'EXT./INT. '),
        (r'INT\.', 'INT. '),
        (r'EXT\.', 'EXT. '),
    )
)


def _collapse_spaces(text):
    """Return *text* with every whitespace run reduced to one space."""
    return ' '.join(str(text).split())


def _drop_intro_rows(df, audit_df, last_label):
    """Drop rows labelled ``last_label`` down to ``0`` and flag them in the audit."""
    print("removing lines till ", last_label)
    for label in range(last_label, -1, -1):
        line_no = df.at[label, 'line_no']
        audit_df.at[line_no, 'line_removed'] = 'Yes'
        audit_df.at[line_no, 'introduction'] = 'Yes'
        df.drop(label, inplace=True)


def trim_intro_english(df, audit_df):
    """Remove title/introduction rows that precede the start of the script.

    The first row containing ``FADE IN`` (case-insensitive) is tagged ``ps17``
    in both frames and every row labelled below it is dropped from ``df`` and
    flagged (``line_removed`` / ``introduction`` = ``'Yes'``) in ``audit_df``.
    When no such row qualifies, the first row whose first word contains one of
    ``INT``, ``EXT``, ``I/E``, ``E/I`` is tagged ``ps1`` in ``df`` and used as
    the boundary instead.  Rows are only dropped when the last intro label is
    at most 100.

    Both frames are modified in place; nothing is returned.
    ``df`` is addressed by integer index label (rows ``0 .. label-1`` are
    dropped) and ``audit_df`` is addressed by ``line_no``.
    """
    print("Identifying and Removing Introduction and Titles")

    # Pass 1: look for an explicit "FADE IN".
    for index in list(df.index):
        data = _collapse_spaces(df.at[index, 'data'])
        if not data:
            print("blank line move on")
            continue
        if not any(re.search(sw, data, re.IGNORECASE) for sw in _INTRO_STOPWORDS):
            continue

        print("Found Fade In", index)
        line_no = df.at[index, 'line_no']
        df.at[index, 'Identification_Status'] = 'ps17'
        audit_df.at[line_no, 'Identification_Status'] = 'ps17'
        if index == 0:
            # Script starts with FADE IN: there is no intro to remove.
            return
        if index - 1 <= _MAX_INTRO_LABEL:
            _drop_intro_rows(df, audit_df, index - 1)
            print("title and introduction removed")
            return
        # Too far into the script to be an intro boundary: keep scanning.

    print("stop words not found")

    # Pass 2: fall back to the first slugline.
    print("looking for first slugline")
    for index in list(df.index):
        data = _collapse_spaces(df.at[index, 'data'])
        if not data:
            print("blank line move on")
            continue
        first_word = data.split()[0]
        if not any(re.search(sw, first_word, re.IGNORECASE) for sw in _FIRST_SLUG_WORDS):
            continue

        print("Found Slugline , also idenfifying as slugline", index)
        df.at[index, 'Identification_Status'] = 'ps1'
        last_intro_label = index - 1
        if last_intro_label < 0:
            print("no intro")
            return
        if last_intro_label <= _MAX_INTRO_LABEL:
            _drop_intro_rows(df, audit_df, last_intro_label)
            print("title and introduction removed before slugline")
            return


def trim_appendix_english(df, audit_df):
    """Remove everything from the closing stop word to the end of the script.

    Rows are scanned from the bottom.  The first non-blank row whose text
    (colons removed) starts with ``FADE OUT``, ``THE END`` or ``BLACK``
    (case-insensitive) and that has at least five rows before it is tagged
    ``ps17`` in ``audit_df``; that row and every row after it are dropped from
    ``df`` and flagged (``line_removed`` / ``appendix`` = ``'Yes'``) in
    ``audit_df``.

    The cut is computed from the row's position rather than by comparing its
    index label with ``len(df)``, so it stays correct when the index has gaps
    (for example after intro rows were dropped in place).

    Both frames are modified in place; nothing is returned.
    """
    print("\nIdentifying and Removing Appendix /Epilogue ")
    labels = list(df.index)
    for pos in range(len(labels) - 1, -1, -1):
        index = labels[pos]
        data = _collapse_spaces(df.at[index, 'data'])
        if not data:
            print("blank line move on")
            continue

        search_data = data.replace(":", "").strip()
        stop_word = next(
            (sw for sw in _APPENDIX_STOPWORDS
             if re.match(sw, search_data, re.IGNORECASE)),
            None,
        )
        if stop_word is None:
            continue

        print("Found stop word", stop_word, ' at index ', index)
        if pos < _MIN_ROWS_BEFORE_APPENDIX:
            # Too close to the top to be the end of the script.
            continue

        line_no = df.at[index, 'line_no']
        audit_df.at[line_no, 'Identification_Status'] = 'ps17'
        print("removing lines from ", index)
        while len(df) > pos:
            last_label = df.index[-1]
            line_no = df.at[last_label, 'line_no']
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'appendix'] = 'Yes'
            df.drop(last_label, inplace=True)
        print("prologue /appendix after stop words removed for audit")
        return

    print("stop words not found")


def check_and_remove_numbers(df, audit_df, index):
    """Strip leading digits from ``df['data']`` at row *index*, in place.

    Digits are removed one at a time for as long as the first non-blank
    character is a digit; leading whitespace is kept.  Only the leading
    digits are removed: digits elsewhere in the line are left untouched.
    ``audit_df`` is accepted for signature compatibility and is not used.
    """
    original = df.at[index, 'data']
    data = original
    while True:
        leading = re.match(r'\s*\d', data)
        if leading is None:
            break
        digit_pos = leading.end() - 1
        data = data[:digit_pos] + data[digit_pos + 1:]
    if data != original:
        df.at[index, 'data'] = data


def _boost_line_after_fade_in(df):
    """Add 20 to ``ps1`` for the line that follows a leading ``ps17`` row."""
    if len(df) < 2:
        return
    status = df['Identification_Status']
    print(status.iloc[0], status.iloc[1])
    ps1_col = df.columns.get_loc('ps1')
    for pos in (0, 1):
        if status.iloc[pos] != 'ps17':
            continue
        # Skip one extra row when the ps17 row has nlb == 'Y'.
        target = pos + 2 if df['nlb'].iloc[pos] == 'Y' else pos + 1
        if target < len(df):
            df.iloc[target, ps1_col] += 20
        return


def _neighbour_indent(df, index_of_line, index, line_no_column):
    """Return ``ssc`` of the row referenced by *line_no_column*, or ``-1``."""
    try:
        neighbour = index_of_line[df.at[index, line_no_column]]
        return df.at[neighbour, 'ssc']
    except (KeyError, TypeError, ValueError):
        return -1


def update_pos_wts_english(df):
    """Apply English-specific adjustments to the ``ps*`` weight columns.

    Per row, in place:

    * ``scene_number`` receives the run of digits at the start of the first
      15 characters of ``data`` (stripped); a non-empty run adds 10 to ``ps1``.
    * ``ps1`` gets +20 when ``case`` is ``'AllUpper'`` and ``data`` contains
      ``INT``, ``EXT``, ``DAY`` or ``NIGHT``.
    * ``ps16`` gets +25 when ``data`` (colons removed) starts with ``CUT TO``,
      else +10 when it merely contains it (case-insensitive).
    * ``ps7`` gets +10 for a "voice from ..." phrase, and +10 when a
      ``PartMidEnd`` parenthetical starts with a "continued" variant.
    * ``ps8`` gets +10 when a ``PartMidEnd`` parenthetical starts with
      ``VO``, ``OS``, ``V.O`` or ``O.S``.

    Before the row loop, the line following a ``ps17`` row in the first two
    positions gets +20 on ``ps1``.

    NOTE(review): a large commented-out generic-weights section (it referenced
    ``wts_df`` and ``check_space``) and the locals that only fed it were
    removed from this function.

    Returns:
        The same ``df`` object.
    """
    print("Running english specific weights update")
    _boost_line_after_fade_in(df)

    # First index label per line_no, built once instead of re-scanning the
    # frame for every row.
    index_of_line = {}
    for label, known_line_no in zip(df.index, df['line_no']):
        index_of_line.setdefault(known_line_no, label)

    for index in df.index:
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        par = df.at[index, 'parenthetical']

        try:
            print("processing line no", line_no, data)
        except UnicodeError:
            pass
        print(df.at[index, 'plb'])
        print(df.at[index, 'nlb'])
        print(_neighbour_indent(df, index_of_line, index, 'pnbl_line_no'))
        print(_neighbour_indent(df, index_of_line, index, 'nnbl_line_no'))

        # Scene number: digits at the very start of the first 15 characters.
        first_15 = data[:15]
        print(first_15)
        leading_digits = re.match(r'\d+', first_15.strip())
        scene_number = leading_digits.group(0) if leading_digits else ''
        df.at[index, 'scene_number'] = scene_number
        if scene_number:
            print("number found in first 15")
            print(scene_number)
            df.at[index, 'ps1'] += 10

        # INT / EXT / DAY / NIGHT only count on all-uppercase lines.
        if df.at[index, 'case'] == 'AllUpper':
            slug_phrase = next(
                (phrase for phrase in _SLUG_PHRASES if re.search(phrase, data)),
                None,
            )
            if slug_phrase is not None:
                print("slugphrase found", slug_phrase)
                df.at[index, 'ps1'] += 20

        # CUT TO
        transition_text = data.replace(":", '').strip()
        if re.match('CUT TO', transition_text, re.IGNORECASE):
            print("cut to exact")
            df.at[index, 'ps16'] += 25
        elif re.search('CUT TO', transition_text, re.IGNORECASE):
            print("cut to found")
            df.at[index, 'ps16'] += 10

        # Speaker related terms
        for term in _SPEAKER_TERMS:
            if re.search(term, data, re.IGNORECASE):
                print("possible speaker term found increasing sp weight", term)
                df.at[index, 'ps7'] += 10
                break

        # Guarded on "(" so a mislabelled row cannot raise IndexError.
        if par == 'PartMidEnd' and '(' in data:
            # Text after the first "(" minus the final character.
            par_data = data.split("(")[1][:-1].strip()
            for term in _SPEAKER_PAREN_TERMS:
                if re.match(term, par_data, re.IGNORECASE):
                    print("possible speaker term found in parenthtical increasing sp weight", term)
                    df.at[index, 'ps7'] += 10
                    break
            # V.O. or O.S. or VO or OS
            for term in _SPEAKER_EXT_PAREN_TERMS:
                if re.match(term, par_data):
                    print("possible speaker extension term found in parenthtical increasing sp externsion weight weight", term)
                    df.at[index, 'ps8'] += 10
                    break

    return df


def _check_slug_start(data):
    """Return True when *data* does not begin with a well-formed slug start."""
    return not data.lstrip().startswith(_VALID_SLUG_STARTS)


def _check_slug_time(slug_time):
    """Return True when *slug_time* does not begin with DAY, NIGHT or EVENING."""
    return not slug_time.strip().startswith(_VALID_SLUG_TIMES)


def _fix_slug_start_once(data):
    """Apply the first matching start fix to *data*; return None if none fits.

    Only the matched prefix is replaced; identical text later in the line is
    left alone.
    """
    stripped = data.lstrip()
    lead = data[:len(data) - len(stripped)]
    for pattern, replacement in _SLUG_START_FIXES:
        found = pattern.match(stripped)
        if found:
            return lead + replacement + stripped[found.end():]
    return None


def _check_slug_mid_special_before(data):
    """Strip non-word characters that precede the slug location.

    Returns ``(data, change_done)``.  When the slugline has more than one
    word it is rebuilt as ``<first word> <rest>`` with hyphens spaced as
    `` - ``, whitespace collapsed and 15 leading spaces added;
    ``change_done`` is True only when leading non-word characters were removed.
    """
    slug_parts = data.split()
    if len(slug_parts) <= 1:
        print("could not identify slug middle")
        return data, False

    slug_mid_end = ' '.join(slug_parts[1:])
    cleaned = re.sub(r'^\W+', '', slug_mid_end)
    change_done = cleaned != slug_mid_end
    cleaned = ' '.join(cleaned.replace('-', ' - ').split())
    rebuilt = slug_parts[0] + ' ' + cleaned
    return ' ' * _SLUG_INDENT + rebuilt, change_done


def _check_slug_mid_extra_dot(data):
    """Replace dots after the slug start with hyphens.

    Returns ``(data, change_done)``; *data* is returned untouched when the
    text after the first word contains no dot.
    """
    slug_parts = data.split()
    if len(slug_parts) <= 1:
        return data, False
    slug_mid_end = ' '.join(slug_parts[1:])
    if '.' not in slug_mid_end:
        return data, False
    slug_mid_end = ' '.join(slug_mid_end.replace('.', '-').split())
    rebuilt = slug_parts[0] + ' ' + slug_mid_end
    return ' ' * _SLUG_INDENT + rebuilt, True


def _append_audit_comment(audit_df, line_no, remark):
    """Append *remark* to the audit comment of *line_no* ('No' means empty)."""
    existing = str(audit_df.at[line_no, _AUDIT_COMMENT_COL])
    prefix = '' if existing == 'No' else existing + '\n'
    audit_df.at[line_no, _AUDIT_COMMENT_COL] = prefix + remark


def _audit_english_ps1(df, audit_df, index):
    """Normalise one slugline row (start, location, hyphens, time of day)."""
    line_no = df.at[index, 'line_no']
    print("Auditing Slugline - Language Specific - English")
    data = df.at[index, 'data']
    if not isinstance(data, str):
        return

    # 1. Slug start: INT- / INT, / INT / int. ... -> "INT. " etc.
    while _check_slug_start(data):
        print("slugline start neeeds correction")
        corrected = _fix_slug_start_once(data)
        if corrected is None or corrected == data:
            # Nothing applicable, or no progress: stop instead of looping.
            break
        data = corrected
        df.at[index, 'data'] = data
        audit_df.at[line_no, _AUDIT_COMMENT_COL] = 'Slugline start corrected '
        print("corrected slug start")

    # 2. Slug location (only meaningful once the start is well-formed).
    if not _check_slug_start(data):
        data, change_done = _check_slug_mid_special_before(data)
        if change_done:
            _append_audit_comment(
                audit_df, line_no,
                'Slugline special char like hyphen removed before location ')
            df.at[index, 'data'] = data

        data, change_done = _check_slug_mid_extra_dot(data)
        if change_done:
            _append_audit_comment(
                audit_df, line_no, 'Slugline extra dots replaced by hyphen ')
            df.at[index, 'data'] = data

    # 3. Collapse runs of hyphens such as "--" or "- -".
    if re.search(r'-[ ]*-', data):
        while re.search(r'-[ ]*-', data):
            data = re.sub(r'-[ ]*-', '-', data)
        df.at[index, 'data'] = data
        _append_audit_comment(audit_df, line_no, 'Slugline extra hyphen removed')

    # 4. Time of day: everything after the first hyphen.
    slug_before_time, _, slug_time = data.partition('-')
    print("slug time is", slug_time)
    if not slug_time:
        data = slug_before_time.rstrip() + ' - DAY'
        df.at[index, 'data'] = data
        _append_audit_comment(audit_df, line_no, 'Slugline Added Default time DAY ')
        print("corrected slug time - added DAY")
    elif _check_slug_time(slug_time):
        print("slugline time neeeds correction")
        time_text = slug_time.lstrip()
        # Whole-word EVE / EVENING in any case; never a prefix of another
        # word, and never "evening" -> "EVENINGning".
        eve = re.match(r'eve(?:ning)?\b', time_text, re.IGNORECASE)
        if eve:
            data = (slug_before_time.rstrip() + ' - '
                    + 'EVENING' + time_text[eve.end():])
            df.at[index, 'data'] = data
            _append_audit_comment(
                audit_df, line_no, 'Slugline EVE replaced by EVENING ')
            print("corrected slug time")


# Identification_Status -> language-specific auditor for that element type.
_ENGLISH_AUDITORS = {'ps1': _audit_english_ps1}


def run_audit_on_identified_english(df, audit_df):
    """Run the English-specific audit on every identified row of ``df``.

    Rows whose ``isIdentified`` is ``'No'`` are skipped.  For the others the
    auditor registered for the row's ``Identification_Status`` is called
    (currently only ``ps1`` sluglines have one).  An auditor failure is
    reported and the loop moves on to the next row.

    ``df['data']`` and ``audit_df['language_specific_audit_comments']`` are
    updated in place.

    Returns:
        The same ``df`` object.
    """
    for index in list(df.index):
        if df.at[index, 'isIdentified'] == 'No':
            continue
        cur_line_pos = df.at[index, 'Identification_Status']
        line_no = df.at[index, 'line_no']
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(cur_line_pos)

        auditor = (_ENGLISH_AUDITORS.get(cur_line_pos)
                   if isinstance(cur_line_pos, str) else None)
        if auditor is None:
            continue
        try:
            auditor(df, audit_df, index)
        except Exception as exc:  # best-effort audit: report and continue
            print("language specific audit failed for line no", line_no, ":", exc)
    return df


def _classify_ai_line(row):
    """Return the ``ps*`` status for one unidentified, non-blank row."""
    if str(row.get('data')).startswith(('INT.', 'EXT.')):
        return 'ps1'

    nlb = row.get('nlb')
    plb = row.get('plb')
    if nlb == 'Y' and plb == 'Y':
        return 'ps16' if row.get('case') == 'AllUpper' else 'ps6'
    if nlb == 'Y' and plb == 'N':
        return 'ps15'

    parenthetical = row.get('parenthetical')
    if nlb == 'N' and plb == 'Y':
        if parenthetical == 'PartMidEnd':
            return 'ps8'
        if parenthetical == 'Absent':
            return 'ps7'
    if nlb == 'N' and plb == 'N' and parenthetical == 'Complete':
        return 'ps10'

    # Identify anything left as action.
    return 'ps6'


def ai_gen_script_to_audited_df(df):
    """Assign an ``Identification_Status`` to every not-yet-identified row.

    Rows already marked ``isIdentified == 'Yes'`` or with status ``'blank'``
    are left alone.  Empty lines become ``'blank'``; other rows are classified
    from ``data``, ``nlb``, ``plb``, ``case`` and ``parenthetical``, falling
    back to ``ps6``.  Every processed row gets ``isIdentified = 'Yes'``.

    Returns:
        The same ``df`` object, modified in place.
    """
    for index in list(df.index):
        if (df.at[index, 'isIdentified'] == 'Yes'
                or df.at[index, 'Identification_Status'] == 'blank'):
            continue

        if str(df.at[index, 'data']).strip() == "":
            status = 'blank'
        else:
            status = _classify_ai_line(df.loc[index])

        df.at[index, 'Identification_Status'] = status
        df.at[index, 'isIdentified'] = 'Yes'
    return df