# Conversion_Kitchen_Code/kitchen_counter/scriptAudit/sa_functions_english.py

import re

def trim_intro_english(df,audit_df):
    """Drop the title / introduction rows that precede the screenplay body.

    Two top-to-bottom passes are made over ``df``; rows whose ``data`` cell is
    blank are skipped.

    1. Rows containing ``FADE IN`` (case-insensitive).  Every hit is marked
       ``'ps17'`` in ``df['Identification_Status']`` and, through its
       ``line_no``, in ``audit_df['Identification_Status']``.
    2. Only if pass 1 did not finish the job: rows whose first word contains
       ``INT``, ``EXT``, ``I/E`` or ``E/I`` (case-insensitive).  Every hit is
       marked ``'ps1'`` (slugline) in ``df`` only.

    For a hit in either pass:

    * hit on the first row  -> nothing to trim, the function returns;
    * at most 101 rows precede the hit -> those rows get ``'Yes'`` in
      ``audit_df['line_removed']`` and ``audit_df['introduction']`` and are
      dropped from ``df`` in place, then the function returns;
    * otherwise the hit stays marked and the scan simply continues.

    Rows are addressed by position, so ``df`` does not need a 0-based
    contiguous index.  All writes go through ``.loc`` because chained
    assignment (``df[col][row] = value``) is not written back when pandas
    Copy-on-Write is active.

    Args:
        df: working frame with at least ``data``, ``line_no`` and
            ``Identification_Status`` columns; modified in place.
        audit_df: audit frame looked up by ``line_no`` label; modified in
            place.

    Returns:
        None.
    """
    print("Identifying and Removing Introduction and Titles")

    stopwords = ['FADE IN' ]
    slugwords = ['INT','EXT','I/E','E/I' ]
    # Largest number of leading rows that is still treated as an introduction.
    max_intro_rows = 101

    def drop_rows_before(first_body_pos):
        """Flag the rows ahead of ``first_body_pos`` in audit_df, then drop them."""
        intro_labels = labels[:first_body_pos]
        for label in intro_labels:
            line_no = df.loc[label, 'line_no']
            audit_df.loc[line_no, 'line_removed'] = 'Yes'
            audit_df.loc[line_no, 'introduction'] = 'Yes'
        df.drop(intro_labels, inplace=True)

    # Snapshot of the labels: df is only mutated right before returning.
    labels = list(df.index)

    # ---- pass 1: explicit FADE IN --------------------------------------
    for pos, index in enumerate(labels):
        data = ' '.join(df.loc[index, 'data'].split())
        if not data:
            print("blank line move on")
            continue
        if not any(re.search(sw, data, re.IGNORECASE) for sw in stopwords):
            continue

        print("Found Fade In",index)
        line_no = df.loc[index, 'line_no']
        df.loc[index, 'Identification_Status'] = 'ps17'
        audit_df.loc[line_no, 'Identification_Status'] = 'ps17'

        if pos == 0:
            return
        if pos <= max_intro_rows:
            print("removing lines till ", labels[pos - 1])
            drop_rows_before(pos)
            print("title and introduction removed")
            return

    print("stop words not found")

    # ---- pass 2: first slugline ----------------------------------------
    print("looking for first slugline")

    for pos, index in enumerate(labels):
        data = ' '.join(df.loc[index, 'data'].split())
        if not data:
            print("blank line move on")
            continue

        # NOTE(review): the slug word may sit anywhere inside the first word,
        # so e.g. "NEXT" also counts as a hit - confirm this is intended.
        first_word = data.split()[0]
        if not any(re.search(sw, first_word, re.IGNORECASE) for sw in slugwords):
            continue

        print("Found Slugline , also idenfifying as slugline",index)
        df.loc[index, 'Identification_Status'] = 'ps1'

        if pos == 0:
            print("no intro")
            return
        if pos <= max_intro_rows:
            print("removing lines till ", labels[pos - 1])
            drop_rows_before(pos)
            print("title and introduction removed before slugline")
            return


def trim_appendix_english(df,audit_df):
    """Drop the closing stop-word line and everything that follows it.

    ``df`` is scanned bottom-up; rows whose ``data`` cell is blank are
    skipped.  A row is a hit when its text, with colons removed and
    whitespace collapsed, *starts with* ``FADE OUT``, ``THE END`` or ``BLACK``
    (case-insensitive).

    * A hit that has fewer than five rows above it is ignored and the scan
      continues upwards.
    * Otherwise the hit's ``line_no`` is marked ``'ps17'`` in
      ``audit_df['Identification_Status']``; the hit row and every row after
      it get ``'Yes'`` in ``audit_df['line_removed']`` and
      ``audit_df['appendix']`` and are dropped from ``df`` in place.

    The tail is selected by row position.  Comparing an index label with
    ``len(df)`` is only right for a 0-based contiguous index, which no longer
    holds once leading rows have been dropped from the frame.

    NOTE(review): ``BLACK`` is a prefix test, so a late line that merely
    begins with "Black..." also triggers the cut - confirm this is intended.

    Args:
        df: working frame with ``data`` and ``line_no`` columns; modified in
            place.  An empty frame is accepted.
        audit_df: audit frame looked up by ``line_no`` label; modified in
            place.

    Returns:
        None.
    """
    print("\nIdentifying and Removing Appendix /Epilogue ")

    stopwords = ['FADE OUT','THE END','BLACK' ]
    # A stop word this close to the top would wipe out almost the whole script.
    min_rows_before_stop = 5

    labels = list(df.index)

    for pos in range(len(labels) - 1, -1, -1):
        index = labels[pos]
        data = ' '.join(df.loc[index, 'data'].split())
        if not data:
            print("blank line move on")
            continue

        search_data = data.replace(":","").strip()
        found = next(
            (sw for sw in stopwords if re.match(sw, search_data, re.IGNORECASE)),
            None,
        )
        if found is None:
            continue
        print("Found stop word",found,' at index ',index)

        if pos < min_rows_before_stop:
            continue

        line_no = df.loc[index, 'line_no']
        audit_df.loc[line_no, 'Identification_Status'] = 'ps17'

        print("removing lines from ", index)
        tail_labels = labels[pos:]
        for label in reversed(tail_labels):
            tail_line_no = df.loc[label, 'line_no']
            audit_df.loc[tail_line_no, 'line_removed'] = 'Yes'
            audit_df.loc[tail_line_no, 'appendix'] = 'Yes'
        df.drop(tail_labels, inplace=True)
        print("prologue /appendix after stop words removed for audit")
        return

    print("stop words not found")

    return

def check_and_remove_numbers(df,audit_df,index):
    """Remove the number(s) that start the ``data`` cell of row ``index``.

    Only digits in the leading run of whitespace-and-digits are deleted; the
    whitespace around them is kept and digits later in the line are left
    alone (``"12 INT. ROOM 21"`` becomes ``" INT. ROOM 21"``).  The cell is
    rewritten only when something was removed.

    Args:
        df: frame with a ``data`` column of strings; modified in place.
        audit_df: not used; kept so the signature stays unchanged.
        index: label of the row to clean.

    Returns:
        None.
    """
    data = df.loc[index, 'data']

    # Leading stretch made of whitespace and digits only (may be empty).
    prefix = re.match(r'[\s\d]*', data).group(0)
    cleaned = re.sub(r'\d', '', prefix) + data[len(prefix):]

    if cleaned != data:
        # .loc: chained assignment is not written back under Copy-on-Write.
        df.loc[index, 'data'] = cleaned


def update_pos_wts_english(df):
    """Apply the English-specific adjustments to the per-line weight columns.

    Adjustments made (all in place, all additive):

    * Line after FADE IN: if the first row, else the second row, has
      ``Identification_Status == 'ps17'``, the row one position below it -
      two below when that row's ``nlb`` is ``'Y'`` - gets ``ps1 += 20``.
      Skipped when the frame is too short to contain the target row.
    * Leading number: the digits that start the first 15 characters of
      ``data`` (after stripping) are stored in ``scene_number`` (empty string
      when there are none); when present ``ps1 += 10``.
    * Slug phrase: for rows whose ``case`` is ``'AllUpper'`` and whose
      ``data`` contains ``INT``, ``EXT``, ``DAY`` or ``NIGHT``: ``ps1 += 20``.
    * ``CUT TO`` (colons ignored, case-insensitive): ``ps16 += 25`` when the
      line starts with it, ``ps16 += 10`` when it appears later in the line.
    * Speaker phrases ("voice from mobile/phone/tv/radio"): ``ps7 += 10``.
    * Rows with ``parenthetical == 'PartMidEnd'``: the text after the first
      ``(`` (minus its final character) is inspected - a leading
      ``continued`` / ``cont.d`` / ``cont`` gives ``ps7 += 10``; a leading
      ``VO`` / ``OS`` / ``V.O`` / ``O.S`` (case-sensitive) gives
      ``ps8 += 10``.  Rows without any ``(`` are left untouched.

    Every write goes through ``.loc`` / ``.iloc`` on the frame itself;
    chained updates such as ``df[col][row] += n`` are not written back when
    pandas Copy-on-Write is active.

    Args:
        df: frame with ``Identification_Status``, ``nlb``, ``line_no``,
            ``data``, ``parenthetical``, ``case``, ``scene_number``, ``ps1``,
            ``ps7``, ``ps8`` and ``ps16`` columns.

    Returns:
        The same ``df`` object.
    """
    print("Running english specific weights update")

    row_count = len(df)
    status = df['Identification_Status']
    if row_count >= 2:
        print(status.iloc[0],status.iloc[1])

    ## line after fade in
    boost_pos = None
    if row_count >= 1 and status.iloc[0] == 'ps17':
        boost_pos = 2 if df['nlb'].iloc[0] == 'Y' else 1
    elif row_count >= 2 and status.iloc[1] == 'ps17':
        boost_pos = 3 if df['nlb'].iloc[1] == 'Y' else 2
    if boost_pos is not None and boost_pos < row_count:
        df.iloc[boost_pos, df.columns.get_loc('ps1')] += 20

    slug_phrases = ['INT','EXT','DAY','NIGHT']
    sp_terms = ['voice from mobile','voice from phone','voice from tv','voice from radio']
    sp_par_terms = ['continued','cont.d','cont']
    ## V.O. or O.S. or VO or OS
    sp_ext_par_terms = ['VO','OS',r'V\.O',r'O\.S']

    for index in df.index:

        line_no = df.loc[index, 'line_no']
        data = df.loc[index, 'data']
        par = df.loc[index, 'parenthetical']

        try:
            print("processing line no",line_no, data)
        except UnicodeError:
            # the console may be unable to encode the script text
            pass

        ## number within the first 15 characters
        num = re.match(r'\d*', data[:15].strip()).group(0)
        df.loc[index, 'scene_number'] = num

        if num:
            print("number found in first 15")
            print(num)
            df.loc[index, 'ps1'] += 10

        ## int, ext, day, night
        if df.loc[index, 'case'] == 'AllUpper':
            for slug_phrase in slug_phrases:
                if re.search(slug_phrase,data):
                    print("slugphrase found",slug_phrase)
                    df.loc[index, 'ps1'] += 20
                    break

        ## CUT TO
        without_colon = data.replace(":",'').strip()
        if re.match('CUT TO',without_colon,re.IGNORECASE):
            print("cut to exact")
            df.loc[index, 'ps16'] += 25
        elif re.search('CUT TO',without_colon,re.IGNORECASE):
            print("cut to found")
            df.loc[index, 'ps16'] += 10

        ## speaker related terms
        for term in sp_terms:
            if re.search(term,data,re.IGNORECASE):
                print("possible speaker term found increasing sp weight",term)
                df.loc[index, 'ps7'] += 10
                break

        if par != 'PartMidEnd':
            continue

        pieces = data.split("(")
        if len(pieces) < 2:
            # flagged as parenthetical but there is no "(" to read from
            continue
        # drop the final character, taken to be the closing bracket
        par_data = pieces[1][:-1].strip()

        for term in sp_par_terms:
            if re.match(term,par_data,re.IGNORECASE):
                print("possible speaker term found in parenthtical increasing sp weight",term)
                df.loc[index, 'ps7'] += 10
                break

        for term in sp_ext_par_terms:
            if re.match(term,par_data):
                print("possible speaker extension term found in parenthtical increasing sp externsion weight weight",term)
                df.loc[index, 'ps8'] += 10
                break

    return df


def run_audit_on_identified_english(df,audit_df):
    """Run the English-specific audit on every identified line of ``df``.

    Rows with ``isIdentified == 'No'`` are skipped.  For the others the audit
    routine registered for the row's ``Identification_Status`` is called;
    only ``'ps1'`` (slugline) has one.  A failure while auditing one row is
    reported on stdout and does not stop the remaining rows.

    The slugline audit rewrites ``df['data']`` and records what it did in
    ``audit_df['language_specific_audit_comments']`` (row label = the line's
    ``line_no``; the value ``'No'`` is treated as "no comment yet", further
    comments are appended on a new line):

    1. Slugline start: a malformed opening (``INT-``, ``EXT,``, ``NT.`` ...)
       is rewritten until the line starts with ``INT. ``, ``EXT. ``,
       ``INT./EXT. `` or ``EXT./INT. ``.  Only the opening itself is
       replaced, never an identical piece of text later in the line.
    2. When the start is valid: non-word characters in front of the location
       are removed, hyphens get single spaces around them, dots after the
       first word become hyphens, and the line is given a 15-space indent.
    3. Runs of hyphens separated only by spaces collapse into one hyphen.
    4. Time of day (the text after the first hyphen): when missing,
       `` - DAY`` is appended; a standalone leading ``EVE`` becomes
       ``EVENING``.

    Args:
        df: frame with ``isIdentified``, ``Identification_Status``,
            ``line_no`` and ``data`` columns; modified in place.
        audit_df: audit frame looked up by ``line_no`` label; modified in
            place.

    Returns:
        The same ``df`` object.
    """
    comment_col = 'language_specific_audit_comments'
    slug_indent = ' ' * 15

    correct_slugstarts = (r'INT\. ', r'EXT\. ', r'INT\.\/EXT\. ', r'EXT\.\/INT\. ')
    correct_slugtimes = ('DAY', 'NIGHT', 'EVENING')

    # (pattern, replacement) pairs, tried in this order against the start of
    # the line (case-insensitive); the first one that matches is applied.
    slugstart_fixes = (
        (r'INT. ', 'INT. '),
        (r'EXT. ', 'EXT. '),
        (r'INT-', 'INT.'),
        (r'EXT-', 'EXT.'),
        (r'INT\/EXT-', 'INT./EXT.'),
        (r'EXT\/INT-', 'EXT./INT.'),
        (r'INT -', 'INT.'),
        (r'EXT -', 'EXT.'),
        (r'INT\/EXT -', 'INT./EXT.'),
        (r'EXT\/INT -', 'EXT./INT.'),
        (r'INT ', 'INT.'),
        (r'EXT ', 'EXT.'),
        (r'INT\/EXT ', 'INT./EXT.'),
        (r'EXT\/INT ', 'EXT./INT.'),
        (r'INT,', 'INT.'),
        (r'EXT,', 'EXT.'),
        (r'INT\/EXT,', 'INT./EXT.'),
        (r'EXT\/INT,', 'EXT./INT.'),
        (r'INT\/EXT\.', 'INT./EXT.'),
        (r'EXT\/INT\.', 'EXT./INT.'),
        (r'NT\.', 'INT.'),
        (r'XT\.', 'EXT.'),
        (r'INT\/ EXT ', 'INT./EXT. '),
        (r'EXT\/ INT ', 'EXT./INT. '),
        (r'INT \/ EXT ', 'INT./EXT. '),
        (r'EXT \/ INT ', 'EXT./INT. '),
        (r'I \s*T', 'INT.'),
        (r'E \s*XT', 'EXT.'),
        (r'INT\.', 'INT. '),
        (r'EXT\.', 'EXT. '),
        (r'INT\.\/EXT\.', 'INT./EXT. '),
        (r'EXT\.\/INT\.', 'EXT./INT. '),
    )

    # \b keeps words that merely begin with "eve" (Evening, EVERYWHERE) intact.
    slugtime_fixes = ((r'EVE\b', 'EVENING'),)

    def add_comment(line_no, message):
        """Append ``message`` to the audit comment stored for ``line_no``."""
        existing = str(audit_df.loc[line_no, comment_col])
        prefix = '' if existing == 'No' else existing + '\n'
        audit_df.loc[line_no, comment_col] = prefix + message

    def check_slug_start(data):
        """Return True when ``data`` does not open with a valid slugline start."""
        text = data.lstrip()
        return not any(re.match(ss, text) for ss in correct_slugstarts)

    def check_slug_mid_special_before(data):
        """Strip non-word characters ahead of the location; space out hyphens.

        Returns ``(new_data, change_done)``; ``change_done`` is True when the
        location part started with a non-word character.  A line with a
        single word is returned unchanged.
        """
        slug_parts = data.split()
        if len(slug_parts) <= 1:
            print("could not identify slug middle")
            return data, False

        slug_mid_end = ' '.join(slug_parts[1:])
        trimmed = re.sub(r'^\W+', '', slug_mid_end)
        change_done = trimmed != slug_mid_end

        trimmed = ' '.join(trimmed.replace('-', ' - ').split())
        return slug_indent + slug_parts[0] + ' ' + trimmed, change_done

    def check_slug_mid_extra_dot(data):
        """Turn dots after the first word into hyphens.

        Returns ``(new_data, change_done)``; a line with a single word is
        returned unchanged.
        """
        slug_parts = data.split()
        if len(slug_parts) <= 1:
            return data, False

        slug_mid_end = ' '.join(slug_parts[1:])
        change_done = '.' in slug_mid_end
        slug_mid_end = ' '.join(slug_mid_end.replace('.', '-').split())
        return slug_indent + slug_parts[0] + ' ' + slug_mid_end, change_done

    def check_slug_time(data):
        """Return True when ``data`` does not open with DAY, NIGHT or EVENING."""
        text = data.strip()
        return not any(re.match(st, text) for st in correct_slugtimes)

    def audit_english_ps1(df,audit_df,index):
        """Normalise one slugline row (start, location, hyphens, time of day)."""
        line_no = df.loc[index, 'line_no']
        print("Auditing Slugline -  Language Specific - English")
        data = df.loc[index, 'data']

        ########### slug start correction
        while check_slug_start(data):
            print("slugline start neeeds correction")
            text = data.lstrip()
            leading = data[:len(data) - len(text)]
            for error, corrected in slugstart_fixes:
                found = re.match(error, text, re.IGNORECASE)
                if found:
                    # swap only the matched opening, keep the rest verbatim
                    data = leading + corrected + text[found.end():]
                    df.loc[index, 'data'] = data
                    audit_df.loc[line_no, comment_col] = 'Slugline start corrected '
                    print("corrected slug start")
                    break
            else:
                # nothing applicable: leave the start as it is
                break

        ######### slug location check (only for a valid start)
        if not check_slug_start(data):
            data, change_done = check_slug_mid_special_before(data)
            if change_done:
                add_comment(line_no, 'Slugline special char like hyphen removed before location ')
            df.loc[index, 'data'] = data

            data, change_done = check_slug_mid_extra_dot(data)
            if change_done:
                add_comment(line_no, 'Slugline extra dots replaced by hyphen ')
            df.loc[index, 'data'] = data

        ##### remove extra hyphens
        commented = False
        doubled = re.search('-[ ]*-', data)
        while doubled:
            data = data.replace(doubled.group(0), '-')
            df.loc[index, 'data'] = data
            if not commented:
                add_comment(line_no, 'Slugline extra hyphen removed')
                commented = True
            doubled = re.search('-[ ]*-', data)

        ########### slug time of day check/correction
        slug_parts = data.split('-')
        slug_before_time = slug_parts[0]
        slug_time = '-'.join(slug_parts[1:])
        print("slug time is",slug_time)

        if not slug_time:
            data = slug_before_time.rstrip() + ' - DAY'
            df.loc[index, 'data'] = data
            add_comment(line_no, 'Slugline Added Default time DAY ')
            print("corrected slug time - added DAY")

        elif check_slug_time(slug_time):
            print("slugline time neeeds correction")
            time_text = slug_time.lstrip()
            for error, corrected in slugtime_fixes:
                found = re.match(error, time_text, re.IGNORECASE)
                if found:
                    data = (slug_before_time.rstrip() + ' - '
                            + corrected + time_text[found.end():])
                    df.loc[index, 'data'] = data
                    add_comment(line_no, 'Slugline EVE replaced by EVENING ')
                    print("corrected slug time")
                    break

    # Audit routine per Identification_Status value.
    auditors = {'ps1': audit_english_ps1}

    for index in df.index:

        if df.loc[index, 'isIdentified'] == 'No':
            continue

        cur_line_pos = df.loc[index, 'Identification_Status']
        line_no = df.loc[index, 'line_no']
        print("\n")
        print("line no",line_no)
        print("index ",index)
        print(cur_line_pos)

        to_call_fn = auditors.get(cur_line_pos)
        if to_call_fn is None:
            continue

        try:
            to_call_fn(df,audit_df,index)
        except Exception as exc:
            # best effort: one bad line must not stop the audit of the rest
            print("language specific audit failed for line no",line_no,repr(exc))

    return df

def ai_gen_script_to_audited_df(df):
    """Assign an ``Identification_Status`` to every row not identified yet.

    Rows with ``isIdentified == 'Yes'`` or a status of ``'blank'`` are left
    alone.  Every other row is classified by the first rule that applies and
    then marked ``isIdentified = 'Yes'``:

    ==========================================  ==========
    condition                                   status
    ==========================================  ==========
    ``data`` empty / whitespace only            ``blank``
    ``data`` starts with ``INT.`` or ``EXT.``   ``ps1``
    nlb=Y, plb=Y, case=AllUpper                 ``ps16``
    nlb=Y, plb=Y                                ``ps6``
    nlb=Y, plb=N                                ``ps15``
    nlb=N, plb=Y, parenthetical=PartMidEnd      ``ps8``
    nlb=N, plb=Y, parenthetical=Absent          ``ps7``
    nlb=N, plb=N, parenthetical=Complete        ``ps10``
    anything else (treated as action)           ``ps6``
    ==========================================  ==========

    ``data`` is converted with ``str()`` once, so a non-string cell is
    classified by the remaining rules instead of raising.  Writes go through
    ``.loc`` because chained assignment is not written back when pandas
    Copy-on-Write is active.

    Args:
        df: frame with ``isIdentified``, ``Identification_Status``, ``data``,
            ``nlb``, ``plb``, ``case`` and ``parenthetical`` columns;
            modified in place.

    Returns:
        The same ``df`` object.
    """
    for index in df.index:

        def cell(column):
            return df.loc[index, column]

        if cell('isIdentified') == 'Yes' or cell('Identification_Status') == 'blank':
            continue

        text = str(cell('data'))

        if text.strip() == "":
            status = 'blank'
        elif text.startswith(('INT.', 'EXT.')):
            status = 'ps1'
        elif cell('nlb') == 'Y' and cell('plb') == 'Y':
            status = 'ps16' if cell('case') == 'AllUpper' else 'ps6'
        elif cell('nlb') == 'Y' and cell('plb') == 'N':
            status = 'ps15'
        elif cell('nlb') == 'N' and cell('plb') == 'Y' and cell('parenthetical') == 'PartMidEnd':
            status = 'ps8'
        elif cell('nlb') == 'N' and cell('plb') == 'Y' and cell('parenthetical') == 'Absent':
            status = 'ps7'
        elif cell('nlb') == 'N' and cell('plb') == 'N' and cell('parenthetical') == 'Complete':
            status = 'ps10'
        else:
            ## identify unidentified as actions
            status = 'ps6'

        df.loc[index, 'Identification_Status'] = status
        df.loc[index, 'isIdentified'] = 'Yes'

    return df