# Source: Conversion_Kitchen_Code/kitchen_counter/scriptAudit/sa_functions_english.py
# (repository viewer header: 755 lines, 27 KiB, Python; last modified 2024-04-27 09:33:09 +00:00)
import re
def trim_intro_english(df, audit_df):
    """Remove the title page / introduction rows that precede the script body.

    The body is taken to start at the first row whose text contains
    ``FADE IN`` (case-insensitive).  If no usable ``FADE IN`` row exists, the
    first row whose first word contains one of ``INT``, ``EXT``, ``I/E`` or
    ``E/I`` (case-insensitive) is used instead.

    Both frames are modified in place:

    * a ``FADE IN`` row gets ``Identification_Status = 'ps17'`` in ``df`` and
      in ``audit_df`` (``audit_df`` is addressed by the row's ``line_no``);
    * a slugline row gets ``Identification_Status = 'ps1'`` in ``df`` only;
    * every row in front of the marker is flagged in ``audit_df``
      (``line_removed`` and ``introduction`` set to ``'Yes'``) and dropped
      from ``df``.

    Rows are only removed when the marker is at most 101 rows into the
    frame; a later marker is still tagged but nothing is dropped and the
    search continues.

    Args:
        df: script lines; needs ``data``, ``line_no`` and
            ``Identification_Status`` columns.
        audit_df: audit frame indexed by ``line_no``; needs
            ``Identification_Status``, ``line_removed`` and ``introduction``.

    Returns:
        None.
    """
    print("Identifying and Removing Introduction and Titles")
    stopwords = ('FADE IN',)
    slugwords = ('INT', 'EXT', 'I/E', 'E/I')
    # Equivalent to the historical rule "marker index - 1 <= 100".
    max_intro_rows = 101

    def normalised_text(label):
        # Collapse every whitespace run; a blank row becomes ''.
        return ' '.join(df.at[label, 'data'].split())

    def drop_rows_before(position):
        # Work on positions, not labels, so a frame whose index does not
        # start at 0 is handled correctly.
        leading = df.index[:position]
        print("removing lines till ", leading[-1])
        for label in leading:
            line_no = df.at[label, 'line_no']
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'introduction'] = 'Yes'
        df.drop(leading, inplace=True)

    labels = list(df.index)

    # Pass 1: look for FADE IN.
    for position, label in enumerate(labels):
        text = normalised_text(label)
        if not text:
            print("blank line move on")
            continue
        if not any(re.search(sw, text, re.IGNORECASE) for sw in stopwords):
            continue
        print("Found Fade In", label)
        line_no = df.at[label, 'line_no']
        df.at[label, 'Identification_Status'] = 'ps17'
        audit_df.at[line_no, 'Identification_Status'] = 'ps17'
        if position == 0:
            # Script starts with FADE IN: there is no introduction.
            return
        if position <= max_intro_rows:
            drop_rows_before(position)
            print("title and introduction removed")
            return

    print("stop words not found")

    # Pass 2: FADE IN was unusable, fall back to the first slugline.
    # NOTE(review): the slug words are searched anywhere inside the first
    # word, so e.g. "POINT" or "NEXT" also qualify -- kept as in the original.
    print("looking for first slugline")
    for position, label in enumerate(labels):
        text = normalised_text(label)
        if not text:
            print("blank line move on")
            continue
        first_word = text.split()[0]
        if not any(re.search(sw, first_word, re.IGNORECASE) for sw in slugwords):
            continue
        print("Found Slugline , also idenfifying as slugline", label)
        df.at[label, 'Identification_Status'] = 'ps1'
        if position == 0:
            print("no intro")
            return
        if position <= max_intro_rows:
            drop_rows_before(position)
            print("title and introduction removed before slugline")
            return
def trim_appendix_english(df, audit_df):
    """Remove everything from the script's closing marker to the end.

    Rows are scanned from the last one backwards.  The first non-blank row
    whose text (colons removed, whitespace collapsed) starts with
    ``FADE OUT``, ``THE END`` or ``BLACK`` (case-insensitive) is the closing
    marker.  When at least five rows precede it, the marker row is tagged
    ``Identification_Status = 'ps17'`` in ``audit_df`` and the marker row plus
    every row after it are flagged in ``audit_df`` (``line_removed`` and
    ``appendix`` set to ``'Yes'``) and dropped from ``df``.

    Both frames are modified in place; ``audit_df`` is addressed by the
    ``line_no`` value of each row.

    Args:
        df: script lines; needs ``data`` and ``line_no`` columns.
        audit_df: audit frame indexed by ``line_no``; needs
            ``Identification_Status``, ``line_removed`` and ``appendix``.

    Returns:
        None.
    """
    print("\nIdentifying and Removing Appendix /Epilogue ")
    stopwords = ('FADE OUT', 'THE END', 'BLACK')
    # A marker within the first five rows is never treated as the ending.
    min_rows_before_marker = 5

    labels = list(df.index)
    for position in range(len(labels) - 1, -1, -1):
        label = labels[position]
        text = ' '.join(df.at[label, 'data'].split())
        if not text:
            print("blank line move on")
            continue

        candidate = text.replace(":", "").strip()
        found = next(
            (sw for sw in stopwords if re.match(sw, candidate, re.IGNORECASE)),
            None,
        )
        if found is None:
            continue

        print("Found stop word", found, ' at index ', label)
        if position < min_rows_before_marker:
            # Every remaining candidate is even earlier, so none can qualify.
            break

        marker_line_no = df.at[label, 'line_no']
        audit_df.at[marker_line_no, 'Identification_Status'] = 'ps17'
        print("removing lines from ", label)

        # Slice by position: comparing an index label with len(df), as the
        # original loop did, only works for a 0-based contiguous index.
        trailing = labels[position:]
        for gone in trailing:
            line_no = df.at[gone, 'line_no']
            audit_df.at[line_no, 'line_removed'] = 'Yes'
            audit_df.at[line_no, 'appendix'] = 'Yes'
        df.drop(trailing, inplace=True)
        print("prologue /appendix after stop words removed for audit")
        return

    print("stop words not found")
    return
def check_and_remove_numbers(df, audit_df, index):
    """Strip the digits that open the ``data`` text of one row.

    Only digits in the leading run of whitespace/digit characters are
    removed; whitespace is kept and digits later in the line are left alone
    (the previous ``str.replace`` based loop deleted every occurrence of a
    leading digit anywhere in the line).  ``df`` is updated in place and only
    when the text actually changes.

    Args:
        df: script lines; needs a ``data`` column holding strings.
        audit_df: unused, kept for interface compatibility.
        index: label of the row to clean.

    Returns:
        None.
    """
    data = df.at[index, 'data']
    leading = re.match(r'[\s\d]*', data)
    prefix = re.sub(r'\d', '', leading.group(0))
    cleaned = prefix + data[leading.end():]
    if cleaned != data:
        df.at[index, 'data'] = cleaned
def update_pos_wts_english(df):
    """Apply the English-specific adjustments to the ``ps*`` weight columns.

    ``df`` is modified in place and also returned.  Adjustments:

    * Line after FADE IN: if row 0 (or else row 1) has
      ``Identification_Status == 'ps17'``, the row after it -- or the one
      after that when the FADE IN row's ``nlb`` is ``'Y'`` -- gets
      ``ps1 += 20``.  Skipped when that row does not exist.
    * Scene number: digits opening the (stripped) first 15 characters of
      ``data`` are stored in ``scene_number`` ('' when there are none); when
      present, ``ps1 += 10``.
    * Slug phrases: ``case == 'AllUpper'`` and ``data`` contains ``INT``,
      ``EXT``, ``DAY`` or ``NIGHT`` -> ``ps1 += 20``.
    * ``CUT TO`` at the start of ``data`` (colons removed) -> ``ps16 += 25``;
      elsewhere in the line -> ``ps16 += 10``.
    * "voice from mobile/phone/tv/radio" in ``data`` -> ``ps7 += 10``.
    * ``parenthetical == 'PartMidEnd'``: the text after the first "(" (minus
      its last character) starting with ``cont`` -> ``ps7 += 10``; starting
      with ``VO``, ``OS``, ``V.O`` or ``O.S`` -> ``ps8 += 10``.

    Args:
        df: needs the columns ``Identification_Status``, ``nlb``, ``line_no``,
            ``data``, ``case``, ``parenthetical``, ``scene_number``, ``ps1``,
            ``ps7``, ``ps8`` and ``ps16``.

    Returns:
        The same ``df`` object.
    """
    print("Running english specific weights update")
    row_count = len(df)

    # ---- line after fade in -------------------------------------------
    ps1_position = df.columns.get_loc('ps1')
    for fade_position in (0, 1):
        if fade_position >= row_count:
            break
        if df['Identification_Status'].iloc[fade_position] != 'ps17':
            continue
        # Skip one extra row when the FADE IN row has nlb == 'Y'.
        step = 2 if df['nlb'].iloc[fade_position] == 'Y' else 1
        target = fade_position + step
        if target < row_count:
            df.iloc[target, ps1_position] += 20
        break

    slug_phrases = ('INT', 'EXT', 'DAY', 'NIGHT')
    speaker_terms = (
        'voice from mobile',
        'voice from phone',
        'voice from tv',
        'voice from radio',
    )
    continued_terms = ('continued', 'cont.d', 'cont')
    extension_terms = ('VO', 'OS', r'V\.O', r'O\.S')

    for index in df.index:
        line_no = df.at[index, 'line_no']
        data = df.at[index, 'data']
        try:
            print("processing line no", line_no, data)
        except UnicodeEncodeError:
            # Consoles with a narrow encoding cannot show every script line.
            pass

        # ---- scene number in the first 15 characters --------------------
        number = re.match(r'\d*', data[:15].strip()).group(0)
        df.at[index, 'scene_number'] = number
        if number:
            print("number found in first 15")
            print(number)
            df.at[index, 'ps1'] += 10

        # ---- int, ext, day, night ----------------------------------------
        if df.at[index, 'case'] == 'AllUpper':
            phrase = next((p for p in slug_phrases if re.search(p, data)), None)
            if phrase is not None:
                print("slugphrase found", phrase)
                df.at[index, 'ps1'] += 20

        # ---- CUT TO --------------------------------------------------------
        without_colon = data.replace(":", '').strip()
        if re.match('CUT TO', without_colon, re.IGNORECASE):
            print("cut to exact")
            df.at[index, 'ps16'] += 25
        elif re.search('CUT TO', without_colon, re.IGNORECASE):
            print("cut to found")
            df.at[index, 'ps16'] += 10

        # ---- speaker related terms ---------------------------------------
        term = next(
            (t for t in speaker_terms if re.search(t, data, re.IGNORECASE)),
            None,
        )
        if term is not None:
            print("possible speaker term found increasing sp weight", term)
            df.at[index, 'ps7'] += 10

        # ---- parenthetical after a speaker name --------------------------
        if df.at[index, 'parenthetical'] != 'PartMidEnd':
            continue
        pieces = data.split("(")
        if len(pieces) < 2:
            # Flagged as a parenthetical but no "(" to read from.
            continue
        # Drop the last character of the segment (presumably the closing ")").
        par_data = pieces[1][:-1].strip()

        term = next(
            (t for t in continued_terms if re.match(t, par_data, re.IGNORECASE)),
            None,
        )
        if term is not None:
            print("possible speaker term found in parenthtical increasing sp weight", term)
            df.at[index, 'ps7'] += 10

        # V.O. or O.S. or VO or OS (case-sensitive)
        term = next((t for t in extension_terms if re.match(t, par_data)), None)
        if term is not None:
            print("possible speaker extension term found in parenthtical increasing sp externsion weight weight", term)
            df.at[index, 'ps8'] += 10

    return df
def run_audit_on_identified_english(df, audit_df):
    """Run the English-specific corrections on every identified line.

    Each row whose ``isIdentified`` is not ``'No'`` is dispatched on its
    ``Identification_Status``.  Only sluglines (``'ps1'``) have a handler;
    every other status is skipped.  For a slugline the handler, in order:

    1. repairs the start (``INT-``, ``EXT,``, ``int. `` ... become ``INT. `` /
       ``EXT. `` / ``INT./EXT. `` / ``EXT./INT. ``);
    2. if the start is valid, removes non-word characters in front of the
       location, replaces dots after the start by hyphens, collapses
       whitespace, spaces hyphens as `` - `` and left-pads the line with
       15 spaces;
    3. collapses repeated hyphens (``- -`` -> ``-``);
    4. appends `` - DAY`` when nothing follows the first hyphen, or expands a
       leading ``EVE`` time to ``EVENING``.

    ``df['data']`` is rewritten in place and a note for each correction is
    stored in ``audit_df['language_specific_audit_comments']`` (addressed by
    ``line_no``; the value ``'No'`` is treated as "no comment yet", later
    notes are appended on a new line).

    A failure while auditing one line is reported on stdout and does not stop
    the remaining lines.

    Args:
        df: script lines; needs ``data``, ``line_no``, ``isIdentified`` and
            ``Identification_Status`` columns.
        audit_df: audit frame indexed by ``line_no`` with a
            ``language_specific_audit_comments`` column.

    Returns:
        The same ``df`` object.
    """
    comment_col = 'language_specific_audit_comments'
    slug_indent = 15

    valid_slug_starts = (r'INT\. ', r'EXT\. ', r'INT\.\/EXT\. ', r'EXT\.\/INT\. ')
    valid_slug_times = ('DAY', 'NIGHT', 'EVENING')
    # (pattern, replacement) pairs, tried in this order against the start of
    # the line, case-insensitively.
    slug_start_fixes = (
        ('INT. ', 'INT. '),
        ('EXT. ', 'EXT. '),
        ('INT-', 'INT.'),
        ('EXT-', 'EXT.'),
        (r'INT\/EXT-', 'INT./EXT.'),
        (r'EXT\/INT-', 'EXT./INT.'),
        ('INT -', 'INT.'),
        ('EXT -', 'EXT.'),
        (r'INT\/EXT -', 'INT./EXT.'),
        (r'EXT\/INT -', 'EXT./INT.'),
        ('INT ', 'INT.'),
        ('EXT ', 'EXT.'),
        (r'INT\/EXT ', 'INT./EXT.'),
        (r'EXT\/INT ', 'EXT./INT.'),
        ('INT,', 'INT.'),
        ('EXT,', 'EXT.'),
        (r'INT\/EXT,', 'INT./EXT.'),
        (r'EXT\/INT,', 'EXT./INT.'),
        (r'INT\/EXT\.', 'INT./EXT.'),
        (r'EXT\/INT\.', 'EXT./INT.'),
        (r'NT\.', 'INT.'),
        (r'XT\.', 'EXT.'),
        (r'INT\/ EXT ', 'INT./EXT. '),
        (r'EXT\/ INT ', 'EXT./INT. '),
        (r'INT \/ EXT ', 'INT./EXT. '),
        (r'EXT \/ INT ', 'EXT./INT. '),
        (r'I \s*T', 'INT.'),
        (r'E \s*XT', 'EXT.'),
        (r'INT\.', 'INT. '),
        (r'EXT\.', 'EXT. '),
        (r'INT\.\/EXT\.', 'INT./EXT. '),
        (r'EXT\.\/INT\.', 'EXT./INT. '),
    )
    # Whole-word abbreviations of the time of day and their full form.
    slug_time_fixes = (('EVE', 'EVENING'),)

    def check_slug_start(data):
        """Return True when the line does not open with a valid slug start."""
        stripped = data.lstrip()
        return not any(re.match(start, stripped) for start in valid_slug_starts)

    def check_slug_time(data):
        """Return True when the text does not open with DAY/NIGHT/EVENING."""
        return not data.strip().startswith(valid_slug_times)

    def rebuild_slug(start, middle):
        """Join start and location with single spaces, indented by 15."""
        return ' ' * slug_indent + start + ' ' + ' '.join(middle.split())

    def check_slug_mid_special_before(data):
        """Strip non-word characters in front of the location.

        Returns ``(data, change_done)``; ``change_done`` is True only when
        such characters were present.
        """
        parts = data.split()
        if len(parts) < 2:
            print("could not identify slug middle")
            return data, False
        middle = ' '.join(parts[1:])
        trimmed = re.sub(r'^\W+', '', middle)
        change_done = trimmed != middle
        # NOTE(review): hyphen spacing and re-indentation are applied to every
        # slugline that has a location, not only to corrected ones -- confirm
        # against the intended output format.
        trimmed = trimmed.replace('-', ' - ')
        return rebuild_slug(parts[0], trimmed), change_done

    def check_slug_mid_extra_dot(data):
        """Replace dots after the slug start by hyphens.

        Returns ``(data, change_done)``; ``change_done`` is True only when a
        dot was present.
        """
        parts = data.split()
        if len(parts) < 2:
            return data, False
        middle = ' '.join(parts[1:])
        change_done = '.' in middle
        return rebuild_slug(parts[0], middle.replace('.', '-')), change_done

    def append_comment(line_no, text):
        """Append an audit note, starting fresh when the cell holds 'No'."""
        existing = str(audit_df.at[line_no, comment_col])
        prefix = '' if existing == 'No' else existing + '\n'
        audit_df.at[line_no, comment_col] = prefix + text

    def audit_english_ps1(index):
        """Apply the slugline corrections to one row."""
        line_no = df.at[index, 'line_no']
        print("Auditing Slugline - Language Specific - English")
        data = df.at[index, 'data']

        # ---- 1. slug start ------------------------------------------------
        ## INT- becomes INT., comma becomes full stop, missing space is added
        while check_slug_start(data):
            print("slugline start neeeds correction")
            stripped = data.lstrip()
            for pattern, corrected in slug_start_fixes:
                found = re.match(pattern, stripped, re.IGNORECASE)
                if found:
                    break
            else:
                # No known mistake at the start: leave the line alone.
                break
            # Replace the leading occurrence only; replacing every occurrence
            # also rewrote text later in the line ("INT " inside "POINT ").
            fixed = data.replace(found.group(0), corrected, 1)
            if fixed == data:
                break  # no progress, avoid looping forever
            data = fixed
            df.at[index, 'data'] = data
            audit_df.at[line_no, comment_col] = 'Slugline start corrected '
            print("corrected slug start")

        # ---- 2. slug location --------------------------------------------
        if not check_slug_start(data):
            data, change_done = check_slug_mid_special_before(data)
            if change_done:
                append_comment(line_no, 'Slugline special char like hyphen removed before location ')
            df.at[index, 'data'] = data

            data, change_done = check_slug_mid_extra_dot(data)
            if change_done:
                append_comment(line_no, 'Slugline extra dots replaced by hyphen ')
            df.at[index, 'data'] = data

        # ---- 3. remove extra hyphens -------------------------------------
        commented = False
        repeated = re.search('-[ ]*-', data)
        while repeated:
            data = data.replace(repeated.group(0), '-')
            df.at[index, 'data'] = data
            if not commented:
                append_comment(line_no, 'Slugline extra hyphen removed')
                commented = True
            repeated = re.search('-[ ]*-', data)

        # ---- 4. slug time of day -----------------------------------------
        # Everything after the first hyphen is treated as the time part.
        slug_before_time, _, slug_time = data.partition('-')
        print("slug time is", slug_time)
        if not slug_time:
            data = slug_before_time.rstrip() + ' - DAY'
            df.at[index, 'data'] = data
            append_comment(line_no, 'Slugline Added Default time DAY ')
            print("corrected slug time - added DAY")
        elif check_slug_time(slug_time):
            print("slugline time neeeds correction")
            time_text = slug_time.lstrip()
            for error, corrected in slug_time_fixes:
                # Whole word only, so "EVENT HALL" or "evening" is untouched.
                found = re.match(error + r'\b', time_text, re.IGNORECASE)
                if not found:
                    continue
                data = (slug_before_time.rstrip() + ' - '
                        + corrected + time_text[found.end():])
                df.at[index, 'data'] = data
                append_comment(line_no, 'Slugline EVE replaced by EVENING ')
                print("corrected slug time")
                break

    # Identification_Status -> handler.  Statuses without an entry have no
    # language-specific audit.
    handlers = {'ps1': audit_english_ps1}

    for index in df.index:
        if df.at[index, 'isIdentified'] == 'No':
            continue
        cur_line_pos = df.at[index, 'Identification_Status']
        line_no = df.at[index, 'line_no']
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(cur_line_pos)

        handler = handlers.get(cur_line_pos)
        if handler is None:
            continue
        try:
            handler(index)
        except Exception as exc:
            # Best effort: one bad line must not abort the audit, but the
            # failure is reported instead of being silently swallowed.
            print("language specific audit failed for line", line_no, ":", repr(exc))
    return df
def ai_gen_script_to_audited_df(df):
    """Classify every not-yet-identified line using simple layout rules.

    Rows that already have ``isIdentified == 'Yes'`` or
    ``Identification_Status == 'blank'`` are left untouched.  Every other row
    gets ``isIdentified = 'Yes'`` and the status of the first matching rule:

    ========================================================  ==========
    condition                                                 status
    ========================================================  ==========
    text is empty / whitespace only                           ``blank``
    text starts with ``INT.`` or ``EXT.``                     ``ps1``
    nlb ``Y``, plb ``Y``, case ``AllUpper``                   ``ps16``
    nlb ``Y``, plb ``Y``                                      ``ps6``
    nlb ``Y``, plb ``N``                                      ``ps15``
    nlb ``N``, plb ``Y``, parenthetical ``PartMidEnd``        ``ps8``
    nlb ``N``, plb ``Y``, parenthetical ``Absent``            ``ps7``
    nlb ``N``, plb ``N``, parenthetical ``Complete``          ``ps10``
    anything else                                             ``ps6``
    ========================================================  ==========

    The cell text is always read through ``str()``, so a non-string cell
    (e.g. NaN) is classified by the layout rules instead of raising.

    Args:
        df: needs the columns ``data``, ``isIdentified``,
            ``Identification_Status``, ``nlb``, ``plb``, ``case`` and
            ``parenthetical``.  Modified in place.

    Returns:
        The same ``df`` object.
    """
    def classify(index):
        text = str(df.at[index, 'data'])
        if not text.strip():
            return 'blank'
        if text.startswith(('INT.', 'EXT.')):
            return 'ps1'

        nlb = df.at[index, 'nlb']
        plb = df.at[index, 'plb']
        if nlb == 'Y' and plb == 'Y':
            return 'ps16' if df.at[index, 'case'] == 'AllUpper' else 'ps6'
        if nlb == 'Y' and plb == 'N':
            return 'ps15'

        if nlb == 'N':
            parenthetical = df.at[index, 'parenthetical']
            if plb == 'Y' and parenthetical == 'PartMidEnd':
                return 'ps8'
            if plb == 'Y' and parenthetical == 'Absent':
                return 'ps7'
            if plb == 'N' and parenthetical == 'Complete':
                return 'ps10'

        ## identify unidentified as actions
        return 'ps6'

    for index in df.index:
        if (df.at[index, 'isIdentified'] == 'Yes'
                or df.at[index, 'Identification_Status'] == 'blank'):
            continue
        df.at[index, 'Identification_Status'] = classify(index)
        df.at[index, 'isIdentified'] = 'Yes'
    return df