import re


def trim_intro_english(df, audit_df):
    print("Identifying and Removing Introduction and Titles")
    ## if FADE IN is found, drop everything before it
    ## else if the first slugline (INT/EXT) is found, drop everything before it

    stopwords = ['FADE IN']
    remove_upto = -1
    intro_removed = False

    for index in df.index:
        data = df['data'][index]
        data = ' '.join(data.split())
        if not data.strip():
            print("blank line move on")
            continue
        for sw in stopwords:
            if re.search(sw, data, re.IGNORECASE):
                print("Found Fade In", index)
                line_no = df['line_no'][index]
                df['Identification_Status'][index] = 'ps17'
                audit_df['Identification_Status'][line_no] = 'ps17'
                if index == 0:
                    intro_removed = True
                    return

                remove_upto = index - 1
                if remove_upto <= 100:
                    print("removing lines till ", remove_upto)
                    while remove_upto != -1:
                        line_no = df['line_no'][remove_upto]
                        audit_df['line_removed'][line_no] = 'Yes'
                        audit_df['introduction'][line_no] = 'Yes'
                        df.drop(remove_upto, inplace=True)
                        remove_upto -= 1

                    intro_removed = True
                    print("title and introduction removed")
                    break
        if intro_removed:
            return

    print("stop words not found")

    ## FADE IN not found, check for the first slugline instead
    slugwords = ['INT', 'EXT', 'I/E', 'E/I']
    remove_upto = -1
    intro_removed = False
    print("looking for first slugline")

    for index in df.index:
        data = df['data'][index]
        data = ' '.join(data.split())
        if not data.strip():
            print("blank line move on")
            continue
        for sw in slugwords:
            print(sw, data)
            if re.search(sw, data.split()[0], re.IGNORECASE):
                print("Found Slugline, also identifying as slugline", index)
                df['Identification_Status'][index] = 'ps1'
                remove_upto = index - 1
                if remove_upto < 0:
                    intro_removed = True
                    print("no intro")
                    break
                elif remove_upto <= 100:
                    print("removing lines till ", remove_upto)
                    while remove_upto != -1:
                        line_no = df['line_no'][remove_upto]
                        audit_df['line_removed'][line_no] = 'Yes'
                        audit_df['introduction'][line_no] = 'Yes'
                        df.drop(remove_upto, inplace=True)
                        remove_upto -= 1

                    intro_removed = True
                    print("title and introduction removed before slugline")
                    break
        if intro_removed:
            return
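

# Illustrative usage sketch (not part of the original pipeline): shows the kind of script
# and audit DataFrames trim_intro_english expects, based on the columns referenced above
# ('data', 'line_no', 'Identification_Status', 'line_removed', 'introduction'). The sample
# rows and the audit frame being indexed by line_no are assumptions, and the call relies on
# the legacy pandas chained-assignment behaviour this module uses throughout.
def _demo_trim_intro_english():
    import pandas as pd

    script_df = pd.DataFrame({
        'line_no': [1, 2, 3, 4],
        'data': ['MY SCRIPT', 'by A. Writer', 'FADE IN:', 'INT. HOUSE - DAY'],
        'Identification_Status': ['No'] * 4,
    })
    audit_df = pd.DataFrame({
        'line_no': [1, 2, 3, 4],
        'Identification_Status': ['No'] * 4,
        'line_removed': ['No'] * 4,
        'introduction': ['No'] * 4,
    }).set_index('line_no', drop=False)

    trim_intro_english(script_df, audit_df)
    # Expected: the two title lines are dropped from script_df and flagged as introduction
    # lines in audit_df, and the FADE IN line is tagged 'ps17'.
    print(script_df)
    print(audit_df)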


def trim_appendix_english(df, audit_df):
    print("\nIdentifying and Removing Appendix /Epilogue ")
    ## if FADE OUT / THE END / BLACK is found, drop that line and everything after it

    stopwords = ['FADE OUT', 'THE END', 'BLACK']
    remove_upto = df.index[-1]
    appendix_removed = False

    for index in df.index[::-1]:
        data = df['data'][index]
        data = ' '.join(data.split())
        ##print(data)
        if not data.strip():
            print("blank line move on")
            continue
        for sw in stopwords:
            search_data = data.replace(":", "")
            if re.match(sw, search_data.strip(), re.IGNORECASE):
                print("Found stop word", sw, ' at index ', index)

                remove_upto = index
                if remove_upto >= 5:
                    line_no = df['line_no'][remove_upto]
                    audit_df['Identification_Status'][line_no] = 'ps17'

                    print("removing lines from ", remove_upto)

                    while remove_upto != len(df):
                        print(len(df))
                        print("in while", remove_upto)
                        line_no = df['line_no'][df.index[-1]]
                        audit_df['line_removed'][line_no] = 'Yes'
                        audit_df['appendix'][line_no] = 'Yes'
                        df.drop(df.index[-1], inplace=True)

                    appendix_removed = True
                    print("epilogue /appendix after stop words removed for audit")
                    break
        if appendix_removed:
            return

    print("stop words not found")

    return
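

# Illustrative usage sketch (not part of the original pipeline): the stop word has to sit
# at positional index >= 5 for the removal branch above to fire, so the toy script below
# carries a few lines before FADE OUT. Column names mirror the ones used above; the rows
# and the audit frame being indexed by line_no are assumptions.
def _demo_trim_appendix_english():
    import pandas as pd

    lines = ['INT. HOUSE - DAY', 'JOHN', 'Hello.', 'He waves.', 'She nods.',
             'FADE OUT.', 'Credits roll.']
    script_df = pd.DataFrame({
        'line_no': range(1, len(lines) + 1),
        'data': lines,
        'Identification_Status': ['No'] * len(lines),
    })
    audit_df = pd.DataFrame({
        'line_no': range(1, len(lines) + 1),
        'Identification_Status': ['No'] * len(lines),
        'line_removed': ['No'] * len(lines),
        'appendix': ['No'] * len(lines),
    }).set_index('line_no', drop=False)

    trim_appendix_english(script_df, audit_df)
    # Expected: 'FADE OUT.' and everything after it are dropped from script_df and
    # flagged as appendix lines in audit_df.
    print(script_df)
    print(audit_df)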


def check_and_remove_numbers(df, audit_df, index):
    data = df['data'][index]
    start_is_num = True
    ## strip digits one at a time while the line still starts with a number
    while start_is_num:
        sub_num = re.search(r'\d', data.lstrip())
        if sub_num:
            if sub_num.start() == 0:
                # remove only the leading occurrence, not every copy of that digit
                data = data.replace(sub_num.group(0), '', 1)
                df['data'][index] = data
                continue
        start_is_num = False
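

# Illustrative usage sketch (not part of the original pipeline): shows the leading scene
# number being stripped character by character. The one-row frame is hypothetical and the
# call relies on the same legacy chained-assignment behaviour as the rest of the module.
def _demo_check_and_remove_numbers():
    import pandas as pd

    script_df = pd.DataFrame({'data': ['12 INT. OFFICE - DAY'], 'line_no': [1]})
    check_and_remove_numbers(script_df, audit_df=None, index=0)
    # Expected: the leading '1' and '2' are removed, leaving ' INT. OFFICE - DAY'.
    print(repr(script_df['data'][0]))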


def update_pos_wts_english(df):

    print("Running english specific weights update")
    print(df['Identification_Status'].iloc[0], df['Identification_Status'].iloc[1])
    ## boost the slugline weight of the line right after FADE IN (ps17)
    if df['Identification_Status'].iloc[0] == 'ps17':
        if df['nlb'].iloc[0] == 'Y':
            df["ps1"].iloc[2] += 20
        else:
            df["ps1"].iloc[1] += 20

    elif df['Identification_Status'].iloc[1] == 'ps17':
        if df['nlb'].iloc[1] == 'Y':
            df["ps1"].iloc[3] += 20
        else:
            df["ps1"].iloc[2] += 20

    for index in df.index:

        line_no = df['line_no'][index]
        data = df['data'][index]
        plb = df['plb'][index]
        nlb = df['nlb'][index]
        par = df['parenthetical'][index]

        pnbl_index = False
        nnbl_index = False

        try:
            pnbl_line_no = df['pnbl_line_no'][index]
            pnbl_index = df.loc[df['line_no'] == pnbl_line_no, :].index.values[0]
        except Exception:
            pnbl_index = False

        try:
            nnbl_line_no = df['nnbl_line_no'][index]
            nnbl_index = df.loc[df['line_no'] == nnbl_line_no, :].index.values[0]
        except Exception:
            nnbl_index = False

        try:
            pnbl_indent = df['ssc'][pnbl_index]
        except Exception:
            pnbl_indent = -1
        try:
            nnbl_indent = df['ssc'][nnbl_index]
        except Exception:
            nnbl_indent = -1

        cur_indent = df['ssc'][index]
        ssc_col = 'ssc_' + str(cur_indent)

        case = df['case'][index]
        try:
            print("processing line no", line_no, data)
        except Exception:
            pass

        print(plb)
        print(nlb)
        print(pnbl_indent)
        print(nnbl_indent)

        lcp = df['lcp'][index]
        #print("lcp ",lcp)
        lcp_col = "lcp_" + str(lcp)

        ###########
        ######### english specific wts
        ###### scene number: digits within the first 15 characters
        first_15 = ''
        if len(data) > 15:
            first_15 = data[0:15]
        else:
            first_15 = data

        print(first_15)
        num_in_15 = False
        num = ''

        for ch in first_15.strip():
            if re.match(r'\d', ch):
                num_in_15 = True
                num += ch
                continue
            else:
                break

        df['scene_number'][index] = str(num)

        if num_in_15:
            print("number found in first 15")
            print(num)
            df["ps1"][index] += 10

        ## int, ext, day, night
        slug_phrases = ['INT', 'EXT', 'DAY', 'NIGHT']
        for slug_phrase in slug_phrases:
            if df['case'][index] != 'AllUpper':
                break
            elif re.search(slug_phrase, data):
                print("slugphrase found", slug_phrase)
                df["ps1"][index] += 20
                break

        ## CUT TO
        if re.match('CUT TO', data.replace(":", '').strip(), re.IGNORECASE):
            print("cut to exact")
            df["ps16"][index] += 25
        elif re.search('CUT TO', data.replace(":", '').strip(), re.IGNORECASE):
            print("cut to found")
            df["ps16"][index] += 10

        ## speaker related terms
        sp_terms = ['voice from mobile', 'voice from phone', 'voice from tv', 'voice from radio']
        for term in sp_terms:
            sub_str = re.search(term, data, re.IGNORECASE)
            if sub_str:
                print("possible speaker term found increasing sp weight", term)
                df["ps7"][index] += 10
                break

        if par == 'PartMidEnd':
            par_data = data.split("(")[1][:-1].strip()
            sp_par_terms = ['continued', 'cont.d', 'cont']
            for term in sp_par_terms:
                sub_str = re.match(term, par_data, re.IGNORECASE)
                if sub_str:
                    print("possible speaker term found in parenthetical increasing sp weight", term)
                    df["ps7"][index] += 10
                    break

            ## V.O. or O.S. or VO or OS
            sp_ext_par_terms = ['VO', 'OS', r'V\.O', r'O\.S']
            for term in sp_ext_par_terms:
                sub_str = re.match(term, par_data)
                if sub_str:
                    print("possible speaker extension term found in parenthetical increasing sp extension weight", term)
                    df["ps8"][index] += 10
                    break

        # Legacy generic weight calculation, kept for reference (it relies on a wts_df
        # weights table and a check_space() helper that are not defined in this section):
        #
        # #print(wts_df.head(0))
        # # make space dict for getting relevant space columns for weights
        # sp_bin_dict = {1:'0-14',2:'15',3:'16-24',4:'25',5:'26-29',6:'30',7:'31-34',8:'35',
        #                9:'36-73',10:'74onwards'}
        # #loop over for the possibilities
        # for i in range(1,32):
        #     if i in ('23','24','32','33'):
        #         continue
        #     df["ps{0}".format(i)][index] = 0
        #
        #     ## get weights for the case
        #     if case in ('EndUpper','MidUpper'):
        #         case = 'FirstLowerSomeUpper'
        #     if case != 'None':
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),case]
        #
        #     ## get weights based on the starting space count
        #     try:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),ssc_col]
        #         #print("starting weight code was here")
        #     except:
        #         pass
        #
        #     #print("ps{0}".format(i),df["ps{0}".format(i)][index])
        #     ## get weights for <19 with Numeric character or <19 without Numeric character
        #     pos_num = re.search('[0-9]',data)
        #     if (pos_num != None) and cur_indent < 15:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withNumeric']
        #     elif check_space(data) < 15:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'<15withoutNumeric']
        #
        #     if cur_indent > 65:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'ssc_gt_65']
        #
        #     ## get weights based on the last character placement
        #     try:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),lcp_col]
        #         #print("code was here")
        #     except:
        #         pass
        #
        #     # how far is it from position 51 63 78
        #     # 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
        #     # modify the weights matrix and create in between weights
        #     #print("ps{0}".format(i),df["ps{0}".format(i)][index])
        #
        #     # Calculation of weights based on plb and nlb (L-O column in sheet)
        #     if plb == "Y":
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_Yes']
        #     if plb == "N":
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PLB_No']
        #     if nlb == "Y":
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_Yes']
        #     if nlb == "N":
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'NLB_No']
        #
        #     #print("ps{0}".format(i),df["ps{0}".format(i)][index])
        #
        #     # Calculation of weights based on parenthesis (H-K column in sheet)
        #     if re.match('\(',data.strip()[:1]) and re.match('\)',data.strip()[-1:]):
        #         # print('EntireLine')
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'EntireLine']
        #     elif re.search('\(',data.strip()) and re.search('\)',data.strip()):
        #         #print('PartofLine')
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'PartofLine']
        #     elif re.search('\(',data.strip()) and not(re.search('\)',data.strip())):
        #         #print('only left parenthetical present')
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only left parenthetical present']
        #     elif not(re.search('\(',data.strip())) and re.search('\)',data.strip()):
        #         #print('only right parenthetical present')
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'only right parenthetical present']
        #
        #     #df["ps{0}".format(i)][index] = math.trunc(df["ps{0}".format(i)][index])
        #     #print("i is ",i)
        #     #print(math.trunc(ps_dict["ps{0}".format(i)]))
        #
        #     ## Calculation of weights based on indent equals previous / next non blank line
        #     if cur_indent == pnbl_indent:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_pnbl']
        #     if cur_indent == nnbl_indent:
        #         df["ps{0}".format(i)][index] += wts_df.loc["ps{0}".format(i),'cur_indent_equals_nnbl']
        #
        #     print("ps{0}".format(i),df["ps{0}".format(i)][index])

    # if 'actual_element' not in df.columns:
    #     df['actual_element'] = ''

    return df
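

# Illustrative usage sketch (not part of the original pipeline): a two-row frame with the
# columns the English weight update reads ('ps*' scores, 'plb'/'nlb', 'ssc', 'lcp', 'case',
# 'parenthetical', 'scene_number'). The rows and values are hypothetical;
# 'pnbl_line_no'/'nnbl_line_no' are omitted because the lookups above are wrapped in
# try/except, and the call relies on legacy chained assignment like the rest of the module.
def _demo_update_pos_wts_english():
    import pandas as pd

    script_df = pd.DataFrame({
        'line_no': [1, 2],
        'data': ['1 INT. HOUSE - DAY', 'JOHN (V.O.)'],
        'Identification_Status': ['No', 'No'],
        'plb': ['N', 'Y'],
        'nlb': ['N', 'N'],
        'parenthetical': ['Absent', 'PartMidEnd'],
        'ssc': [0, 25],
        'lcp': [18, 36],
        'case': ['AllUpper', 'AllUpper'],
        'scene_number': ['', ''],
        'ps1': [0, 0], 'ps7': [0, 0], 'ps8': [0, 0], 'ps16': [0, 0],
    })
    update_pos_wts_english(script_df)
    # Expected: row 0 gains +10 (leading scene number) and +20 (INT/DAY in upper case) on
    # 'ps1'; row 1 gains +10 on 'ps8' for the '(V.O.)' speaker extension.
    print(script_df[['data', 'ps1', 'ps8', 'scene_number']])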


def run_audit_on_identified_english(df, audit_df):
    def check_slug_start(data):
        slug_start_correction_required = True
        ########### slug start checks
        correct_slugstarts = [r'INT\. ', r'EXT\. ', r'INT\.\/EXT\. ', r'EXT\.\/INT\. ']
        for ss in correct_slugstarts:
            sub_str = re.search(ss, data.lstrip())
            try:
                print(ss, sub_str, data)
            except Exception:
                pass
            if sub_str:
                if sub_str.start() == 0:
                    slug_start_correction_required = False
                    break

        return slug_start_correction_required

    def check_slug_mid_special_before(data):
        change_done = False
        slug_parts = data.split()
        if len(slug_parts) > 1:
            slug_mid_end = ''
            for i in range(1, len(slug_parts)):
                slug_mid_end += slug_parts[i] + ' '

            slug_mid_end = slug_mid_end.rstrip()
            sub_str = re.match(r'\w', slug_mid_end[0])
            if not sub_str:
                change_done = True
                print(sub_str, ' in ', slug_mid_end)
                # drop leading special characters until the location text starts
                while not sub_str and slug_mid_end.strip():
                    slug_mid_end = slug_mid_end[1:]
                    try:
                        sub_str = re.match(r'\w', slug_mid_end[0])
                        print(sub_str, ' in ', slug_mid_end)
                    except Exception:
                        pass

            slug_mid_end = slug_mid_end.replace('-', ' - ')
            slug_mid_end = (' ').join(slug_mid_end.split())
            data = slug_parts[0] + ' ' + slug_mid_end
            data = data.rjust(15 + len(data))
        else:
            print("could not identify slug middle")

        return data, change_done

    def check_slug_mid_extra_dot(data):
        change_done = False
        slug_parts = data.split()
        if len(slug_parts) > 1:
            slug_mid_end = ''
            for i in range(1, len(slug_parts)):
                slug_mid_end += slug_parts[i] + ' '

            slug_mid_end = slug_mid_end.rstrip()
            sub_str = re.search(r'\.', slug_mid_end)
            if sub_str:
                change_done = True
                slug_mid_end = slug_mid_end.replace('.', '-')

            slug_mid_end = (' ').join(slug_mid_end.split())
            data = slug_parts[0] + ' ' + slug_mid_end
            data = data.rjust(15 + len(data))

        return data, change_done

    def check_slug_time(data):
        slug_time_correction_required = True
        ########### slug time checks
        correct_slugtimes = ['DAY', 'NIGHT', 'EVENING']
        for st in correct_slugtimes:
            sub_str = re.search(st, data.strip())
            try:
                print(st, sub_str, data)
            except Exception:
                pass
            if sub_str:
                if sub_str.start() == 0:
                    slug_time_correction_required = False
                    break

        return slug_time_correction_required

    def audit_english_ps1(df, audit_df, index):
        line_no = df['line_no'][index]
        print("Auditing Slugline - Language Specific - English")
        data = df['data'][index]
        current_comment = ''
        change_done = False

        slug_start_correction_required = False
        try:
            slug_start_correction_required = check_slug_start(data)
        except Exception:
            pass
        print(slug_start_correction_required)

        while slug_start_correction_required:
            ## INT- becomes INT.
            ## replace comma by full stop
            ## space added if not present
            print("slugline start needs correction")
            slugerrors_dict = {'INT. ': 'INT. ',
                               'EXT. ': 'EXT. ',
                               'INT-': 'INT.',
                               'EXT-': 'EXT.',
                               r'INT\/EXT-': 'INT./EXT.',
                               r'EXT\/INT-': 'EXT./INT.',
                               'INT -': 'INT.',
                               'EXT -': 'EXT.',
                               r'INT\/EXT -': 'INT./EXT.',
                               r'EXT\/INT -': 'EXT./INT.',
                               'INT ': 'INT.',
                               'EXT ': 'EXT.',
                               r'INT\/EXT ': 'INT./EXT.',
                               r'EXT\/INT ': 'EXT./INT.',
                               'INT,': 'INT.',
                               'EXT,': 'EXT.',
                               r'INT\/EXT,': 'INT./EXT.',
                               r'EXT\/INT,': 'EXT./INT.',
                               r'INT\/EXT\.': 'INT./EXT.',
                               r'EXT\/INT\.': 'EXT./INT.',
                               r'NT\.': 'INT.',
                               r'XT\.': 'EXT.',
                               r'INT\/ EXT ': 'INT./EXT. ',
                               r'EXT\/ INT ': 'EXT./INT. ',
                               r'INT \/ EXT ': 'INT./EXT. ',
                               r'EXT \/ INT ': 'EXT./INT. ',
                               r'I \s*T': 'INT.',
                               r'E \s*XT': 'EXT.',
                               r'INT\.': 'INT. ',
                               r'EXT\.': 'EXT. ',
                               r'INT\.\/EXT\.': 'INT./EXT. ',
                               r'EXT\.\/INT\.': 'EXT./INT. '
                               }

            for error, corrected in slugerrors_dict.items():
                sub_str = re.search(error, data.lstrip(), re.IGNORECASE)
                if sub_str:
                    if sub_str.start() == 0:
                        data = data.replace(sub_str.group(0), corrected)
                        df['data'][index] = data
                        audit_df['language_specific_audit_comments'][line_no] = 'Slugline start corrected '
                        print("corrected slug start")
                        slug_start_correction_required = check_slug_start(data)
                        break
                else:
                    slug_start_correction_required = False
                    continue

        ## slugline: keep only a single space between slug start and location
        ######### slug location check
        slug_start_correction_required = check_slug_start(data)
        if not slug_start_correction_required:
            data, change_done = check_slug_mid_special_before(data)
        if str(audit_df['language_specific_audit_comments'][line_no]) == 'No':
            current_comment = ''
        else:
            current_comment = str(audit_df['language_specific_audit_comments'][line_no]) + '\n'

        if change_done:
            audit_df['language_specific_audit_comments'][line_no] = current_comment + 'Slugline special char like hyphen removed before location '
            df['data'][index] = data

        data, change_done = check_slug_mid_extra_dot(data)
        if str(audit_df['language_specific_audit_comments'][line_no]) == 'No':
            current_comment = ''
        else:
            current_comment = str(audit_df['language_specific_audit_comments'][line_no]) + '\n'

        if change_done:
            audit_df['language_specific_audit_comments'][line_no] = current_comment + 'Slugline extra dots replaced by hyphen '
            df['data'][index] = data

        ##############
        ##### remove extra hyphens
        sub_str = re.search('-[ ]*-', data)
        cmt = False
        while sub_str:
            data = data.replace(sub_str.group(0), '-')
            df['data'][index] = data
            if str(audit_df['language_specific_audit_comments'][line_no]) == 'No':
                current_comment = ''
            else:
                current_comment = str(audit_df['language_specific_audit_comments'][line_no]) + '\n'

            if not cmt:
                audit_df['language_specific_audit_comments'][line_no] = current_comment + 'Slugline extra hyphen removed'
                cmt = True
            sub_str = re.search('-[ ]*-', data)

        slug_time_correction_required = False
        change_done = False
        ########### slug time of day check/correction
        slug_parts = data.split('-')
        slug_before_time = slug_parts[0]
        slug_time = ''
        if len(slug_parts) >= 2:
            slug_time = ('-').join(slug_parts[1:])
            #slug_time = slug_parts[1]
            print(slug_time)
            slug_time_correction_required = check_slug_time(slug_time)
        print("slug time is", slug_time)
        slugtimeerrors_dict = {'EVE': 'EVENING'}

        if not slug_time:
            data = slug_before_time.rstrip() + ' - DAY'
            df['data'][index] = data
            if str(audit_df['language_specific_audit_comments'][line_no]) == 'No':
                current_comment = ''
            else:
                current_comment = str(audit_df['language_specific_audit_comments'][line_no]) + '\n'

            audit_df['language_specific_audit_comments'][line_no] = current_comment + 'Slugline Added Default time DAY '
            print("corrected slug time - added DAY")

        elif slug_time_correction_required:
            print("slugline time needs correction")

            for error, corrected in slugtimeerrors_dict.items():
                sub_str = re.search(error, slug_time.lstrip(), re.IGNORECASE)
                if sub_str:
                    if sub_str.start() == 0:
                        slug_time = slug_time.replace(sub_str.group(0), corrected)
                        data = slug_before_time.rstrip() + ' - ' + slug_time.lstrip()
                        df['data'][index] = data
                        if str(audit_df['language_specific_audit_comments'][line_no]) == 'No':
                            current_comment = ''
                        else:
                            current_comment = str(audit_df['language_specific_audit_comments'][line_no]) + '\n'

                        audit_df['language_specific_audit_comments'][line_no] = current_comment + 'Slugline EVE replaced by EVENING '
                        print("corrected slug time")
                        slug_time_correction_required = check_slug_time(data)
                        break
                else:
                    slug_time_correction_required = False
                    continue

    #### slug year correction

    index_iter = iter(df.index)

    for index in index_iter:

        if df['isIdentified'][index] == 'No':
            continue

        nl_deleted = False
        cur_line_pos = df['Identification_Status'][index]
        fn_name = 'audit_english_' + cur_line_pos
        line_no = df['line_no'][index]
        print("\n")
        print("line no", line_no)
        print("index ", index)
        print(cur_line_pos)

        try:
            to_call_fn = locals()[fn_name]
            print(to_call_fn)
        except Exception:
            continue

        try:
            #nl_deleted = to_call_fn(df,audit_df,index)
            to_call_fn(df, audit_df, index)
        except Exception:
            pass

        # if nl_deleted :
        #     next(index_iter)

    #df = df.sort_index().reset_index(drop=True)
    #df = df.sort_values(by=['line_no']).reset_index(drop=True)

    return df


def ai_gen_script_to_audited_df(df):

    for index in df.index:
        if df['isIdentified'][index] == 'Yes' or df['Identification_Status'][index] == 'blank':
            continue
        if str(df['data'][index]).strip() == "":
            df['isIdentified'][index] = 'Yes'
            df['Identification_Status'][index] = 'blank'
            continue
        if df['data'][index].startswith('INT.') or df['data'][index].startswith('EXT.'):
            df['Identification_Status'][index] = 'ps1'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'Y' and df['plb'][index] == 'Y' and df['case'][index] == 'AllUpper':
            df['Identification_Status'][index] = 'ps16'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'Y' and df['plb'][index] == 'Y':
            df['Identification_Status'][index] = 'ps6'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'Y' and df['plb'][index] == 'N':
            df['Identification_Status'][index] = 'ps15'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'N' and df['plb'][index] == 'Y' and df['parenthetical'][index] == 'PartMidEnd':
            df['Identification_Status'][index] = 'ps8'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'N' and df['plb'][index] == 'Y' and df['parenthetical'][index] == 'Absent':
            df['Identification_Status'][index] = 'ps7'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        if df['nlb'][index] == 'N' and df['plb'][index] == 'N' and df['parenthetical'][index] == 'Complete':
            df['Identification_Status'][index] = 'ps10'
            df['isIdentified'][index] = 'Yes'
            #print(df['data'][index])
            continue
        ## identify unidentified as actions
        df['Identification_Status'][index] = 'ps6'
        df['isIdentified'][index] = 'Yes'

    return df
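

# Illustrative usage sketch (not part of the original pipeline): a tiny AI-generated
# script frame with the flags read above ('isIdentified', 'nlb', 'plb', 'case',
# 'parenthetical'). The rows are hypothetical and, like the rest of the module, the call
# relies on legacy pandas chained assignment.
def _demo_ai_gen_script_to_audited_df():
    import pandas as pd

    script_df = pd.DataFrame({
        'data': ['INT. HOUSE - DAY', 'JOHN', 'Hello there.', 'CUT TO:'],
        'isIdentified': ['No'] * 4,
        'Identification_Status': ['No'] * 4,
        'nlb': ['N', 'N', 'Y', 'Y'],
        'plb': ['Y', 'Y', 'N', 'Y'],
        'case': ['AllUpper', 'AllUpper', 'FirstUpper', 'AllUpper'],
        'parenthetical': ['Absent', 'Absent', 'Absent', 'Absent'],
    })
    ai_gen_script_to_audited_df(script_df)
    # Expected: slugline -> ps1, 'JOHN' -> ps7 (speaker), 'Hello there.' -> ps15,
    # 'CUT TO:' -> ps16 (transition).
    print(script_df[['data', 'Identification_Status']])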