# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection4march.py

# import textract
from tqdm import tqdm
import sys 
import re
import docx
import os
doc = docx.Document() 
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from collections import Counter
from statistics import mode


from .translation.script_detector import script_cat
from .translation.script_writing import default_script


# import textract
from tqdm import tqdm
import sys 
import re
import docx
import os
doc = docx.Document() 
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
basePath = BasePath()


# --- Google Translate setup -------------------------------------------------
# The credentials environment variable is set before the clients below are
# constructed, so they are created with this service-account key file.
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client; this is the one language_detector() calls.
translate_client = Translate.Client()
# v3 service client plus its resource path. Neither `client` nor `parent` is
# referenced by the functions visible in this file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"


# --- Screenplay parsing constants --------------------------------------------
# Slugline prefix: optional scene number, optional dot, optional whitespace,
# then one of I/E, one of N/X and a T. Besides INT and EXT this also matches
# prefixes such as "ENT" or "INTERCUT".
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Combined interior/exterior prefix: optional scene number, optional
# whitespace, then "I/E", "E/I" (also "I/I", "E/E") with an optional dot.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs that equal one of these strings are stored by getScenes() as
# {'Transition': <text>} entries.
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# getSpeakers() discards any candidate name containing one of these words.
# 'FADE' is listed twice; the duplicate has no effect on that check.
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Set the default ("Normal") style of the module-level document to
# 12 pt Courier New.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

def breaksen(s):
    """Split a long sentence into word-bounded chunks.

    A sentence of at most 256 whitespace-separated words is returned
    unchanged as the only element of the list.  A longer sentence is cut
    into consecutive chunks of 32 words, each re-joined with single spaces.

    Args:
        s: The sentence to split.

    Returns:
        A list of strings that together contain the words of ``s`` in order.
    """
    words = s.split()  # split once instead of once per chunk
    if len(words) <= 256:
        return [s]
    # Stepping by the chunk size avoids the empty trailing chunk the former
    # ``range(n // 32 + 1)`` loop produced when n was a multiple of 32.
    return [" ".join(words[start:start + 32]) for start in range(0, len(words), 32)]

def _refined_starts_with_slug(text):
    """Return a truthy value when ``text`` begins like a scene heading."""
    return (
        text.startswith(('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT'))
        or re.match(slug_pattern, text)
        or re.match(pat, text)
    )


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slugline on.

    Steps:
      1. Extract the text of ``filename1`` with textract and store it in
         ``<basePath>/conversion/translation/file.txt``.
      2. Copy the lines to ``file1.txt`` in the same directory, starting at
         the first slugline.  After a speaker line, blank lines are dropped
         until the first line that is not a parenthetical has been written.
      3. Run ``file1.txt`` through textract again, overwrite ``file.txt`` with
         the result, and split it into paragraphs on blank lines.
      4. Drop the paragraphs that precede the first slugline paragraph.

    Args:
        filename1: Path of the script file to read.

    Returns:
        A tuple ``(refined, total_scenes)``.  ``refined`` is the list of
        non-empty, stripped paragraphs.  ``total_scenes`` is the number of
        paragraphs that were skipped in step 4 (despite its name).
    """
    # Imported here because the module-level import is commented out, which
    # made every call fail with NameError on ``textract``.
    import textract

    print("get_refined_called")
    total_scenes = 0
    raw_path = rf"{basePath}/conversion/translation/file.txt"
    cleaned_path = rf"{basePath}/conversion/translation/file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as f:
        f.write(text)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as f:
        doc11 = f.read()

    dialog_coming = False  # True between a speaker line and its first dialogue line
    flag = False           # True once the first slugline has been seen
    print("Slugline")
    with open(cleaned_path, 'w', encoding="utf8", errors='ignore') as f1:
        for line in doc11.split("\n"):
            line = line.strip()
            print("line 427:", line)
            if _refined_starts_with_slug(line) and not line.startswith(('INTERCUT', 'INTERMISSION', 'INTERVAL')):
                flag = True
                f1.write(line)
                f1.write('\n')
                continue

            print("line 96: else loop", line)
            if not flag:
                # Everything before the first slugline is discarded.
                continue
            print("line 99: if loop:", line)

            if dialog_coming:
                if line == "":
                    print("line empty or just have newline", line)
                    continue
                print("line 101 probable dialog or PC: ", line)
                f1.write(line)
                f1.write('\n')
                if not re.match(r"\(.*\)", line):
                    # First non-parenthetical line after the speaker.
                    print(" line 207: else of PCs", line)
                    dialog_coming = False
                    print(" line 457 dialog over")
                continue

            if line.isupper() and (re.fullmatch(r"([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", line) or re.fullmatch(r"(MRS?|DR|ER|PHD|ESQ|HON|JR|MS|MESSRS|MMES|MSGR|PROF|REV|RT. HON|SR|ST)\. [A-Z]+", line)):
                print("line 111: May be speaker: ", line)
                f1.write(line)
                f1.write('\n')
                dialog_coming = True
                continue

            print("470 probably action or something else so just write it", line)
            f1.write(line)
            f1.write('\n')

    print("line 132 file closed")
    # file.txt is overwritten with the content of file1.txt.
    text = textract.process(cleaned_path, encoding="utf8", errors='ignore')
    print("line 136: ", text)
    with open(raw_path, 'wb') as f:
        f.write(text)

    # Read back with the same encoding used everywhere else in this function
    # instead of the platform default.
    with open(raw_path, "r", encoding="utf8", errors='ignore') as f:
        paragraphs = f.read().split('\n\n')

    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]

    for index, paragraph in enumerate(refined):
        if not _refined_starts_with_slug(paragraph):
            total_scenes = total_scenes + 1
            continue
        refined = refined[index:]
        break

    print("line 163:Refined", refined)
    return refined, total_scenes

def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and everything else.

    A stripped paragraph counts as a slugline when it starts with one of the
    INT/EXT style prefixes or matches ``slug_pattern`` or ``pat``.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        A tuple ``(sluglines, without_slug)`` of stripped paragraphs, each
        list in input order.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for raw in refined:
        para = raw.strip()
        is_heading = (
            para.startswith(heading_prefixes)
            or re.match(slug_pattern, para)
            or re.match(pat, para)
        )
        bucket = sluglines if is_heading else without_slug
        bucket.append(para)
    return sluglines, without_slug

def getSpeakers(without_slug):
    """Collect the distinct character names found in non-slugline paragraphs.

    Every line of every paragraph is a candidate.  A line is accepted when
    it is upper-case, the probe line (two lines further on, clamped to the
    last line of the paragraph) is not blank, and it starts with the
    character-name pattern.  The name is the text before the first ``(``.

    Names that are empty, that equal a transition heading, or that contain a
    word from ``reserved_words`` are discarded.

    Args:
        without_slug: Iterable of paragraph strings.

    Returns:
        A list of unique character names in no particular order.
    """
    found = set()
    for para in without_slug:
        lines = para.split('\n')
        last = len(lines) - 1
        for pos, item in enumerate(lines):
            probe = lines[min(pos + 2, last)]
            if not item.isupper() or probe.strip() == "":
                continue
            if re.match(r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*", item):
                found.add(item.split("(")[0].strip())

    reserved = set(reserved_words)
    # Transition headings such as 'DISSOLVE TO:' are upper-case too.  Left in
    # this list they would be handled as speakers without dialogue by
    # getScenes() and vanish from the scene instead of being kept as
    # transitions.
    return [
        name for name in found
        if name and name not in transitions and reserved.isdisjoint(name.split(" "))
    ]


def getScenes(refined,total_scenes,characters):
    """Group paragraphs into scenes and classify each paragraph.

    A paragraph that starts like a slugline opens a new scene, except when it
    starts with INTERCUT, INTERMISSION or INTERVAL (the same exclusion
    getRefined() applies).  Any other paragraph is classified as:

      * dialogue, when the text before the first ``(`` of its first line is
        in ``characters``; stored as
        ``{speaker: [parenthetical, scene_no, dialogue, len(dialogue), prev, next]}``;
      * a transition, when it equals an entry of ``transitions``; stored as
        ``{'Transition': text}``;
      * an action line otherwise; stored as a string.

    Paragraphs before the first slugline are dropped.

    Args:
        refined: Paragraph strings in script order.
        total_scenes: Unused; kept for interface compatibility.
        characters: Known character names.

    Returns:
        A tuple ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``.
        ``scenes`` is a list of scenes, each a list whose string elements
        have been split on newlines.  ``speakers`` holds unique names in no
        particular order.
    """
    slug_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT', 'I/E', 'E/I')
    not_slug_prefixes = ('INTERCUT', 'INTERMISSION', 'INTERVAL')
    known_characters = set(characters)

    scenes = []
    scene = []
    dialogues = []
    speakers = []
    actionline = []
    parenthetical_lis = []
    scene_no = 0
    i = 0  # position of the next dialogue inside the current scene
    parenthetical = 'NONE'

    def _neighbour(position):
        # ``main_lis`` is not defined in the visible part of this module, so
        # this lookup normally yields "".  The lookup is kept (instead of a
        # bare ``except``) in case the name is provided elsewhere.
        try:
            return main_lis[scene_no - 1][position]
        except (NameError, LookupError, TypeError):
            return ""

    for line in refined:
        head = line.strip()
        is_slug = (
            head.startswith(slug_prefixes)
            or re.match(slug_pattern, head)
            or re.match(pat, head)
        ) and not head.startswith(not_slug_prefixes)
        if is_slug:
            scenes.append(scene)
            scene = [line]
            i = 0
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        print(" \n Line 222 probable dialogue list", lis)
        name = lis[0].split('(')[0].strip()

        if name in known_characters:
            speaker = name
            print("Speaker 228", speaker)
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                parenthetical = lis[1].replace("\n", "")
                dia = ' '.join(lis[2:])
                dia = dia.replace("\n", "").replace("\"", '')
            else:
                # Join with a space: joining with '' glued the last word of
                # one line to the first word of the next.
                dia = ' '.join(lis[1:])
                dia = dia.replace("\n", "").replace("\"", '')
                print("  length dia\n", len(dia))

            if not (len(dia) == 0 and parenthetical == "NONE"):
                print(" len dia != and Parenthetical == NONE: 384 ")
                # prev / next: neighbouring speakers of this dialogue.
                prev = _neighbour(i - 1) if i - 1 >= 0 else ""
                following = _neighbour(i + 1)
                mydic = {speaker: [parenthetical, scene_no, dia, len(dia), prev, following]}
                print("line 259", mydic)
                i = i + 1
                speakers.append(speaker)
                parenthetical_lis.append(parenthetical)
                dialogues.append(mydic)
                scene.append(mydic)
            parenthetical = "NONE"
            continue

        flat = ' '.join(line.replace("\n", " ").split())
        if flat.strip() in transitions:
            scene.append({'Transition': flat.strip()})
            continue
        actionline.append(flat)
        scene.append(flat.strip())

    scenes.append(scene)
    speakers = list(set(speakers))
    scenes = scenes[1:]  # drop whatever preceded the first slugline

    s = []
    for scene in scenes:
        s1 = []
        for ele in scene:
            if isinstance(ele, str):
                s1.extend(ele.split("\n"))
            else:
                s1.append(ele)
        s.append(s1)
    print("dialogue: ", dialogues)
    return s, actionline, parenthetical_lis, speakers, dialogues

# def getScenes(refined, total_scenes, characters):
#     # To find scenes data structure and prev and next scenes numbers
#     i = 0
#     scene = []
#     dialogues = []
#     speakers = []
#     slugline_dic = {}
#     prev_dial_speaker = ""
#     next_dial_speaker = ""
#     pc = 0
#     scene_no = 0
#     actionline = []
#     successor_scene_no = 0
#     predecessor_scene_no = 0
#     parenthetical_lis = []

#     scenes = []
#     speaker = ""
#     parenthetical = 'NONE'
#     predecessor_scene_no_dict = {
#         'Scene '+str(i+1): 0 for i in range(total_scenes)}
#     dia_count = {'Scene '+str(i+1): 0 for i in range(total_scenes)}
#     successor_scene_no_dict = {
#         'Scene '+str(i+1): 0 for i in range(total_scenes)}
#     parenthetical_count_dict = {
#         'Scene '+str(i+1): 0 for i in range(total_scenes)}
#     patttern = r'[\d]*[.]?[\s]*[IE][NX]T'
#     for line in refined:
#         if ((line.strip().startswith(('INT.', 'INT')) or line.strip().startswith(('EXT.', 'EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E', 'E/I')) or re.match(slug_pattern, line.strip()) or re.match(pat, line.strip())) and not (line.strip().startswith(('INTERCUT', 'INTERVAL', 'INTERMISSION')))):
#             # if re.match(patttern,line):
#             #     current_scene=line.split(" ")[0]
#             #     current_scene=current_scene[:1]
#             #     current_scene=int(current_scene)
#             #     successor_scene_no=min(int(current_scene)+2,total_scenes)
#             #     predecessor_scene_no=max(int(current_scene),-1)
#             # else:

#             successor_scene_no = min(scene_no+3, total_scenes)
#             predecessor_scene_no = max(scene_no+1, 0)
#             if scene_no > 0:
#                 parenthetical_count_dict['Scene '+str(scene_no+1)] = pc
#             pc = 0
#             scenes.append(scene)
#             scene = []
#             i = 0
#             scene_no += 1
#             predecessor_scene_no_dict['Scene ' +
#                                         str(scene_no+1)] = predecessor_scene_no
#             successor_scene_no_dict['Scene ' +
#                                     str(scene_no+1)] = successor_scene_no
#             successor_scene_no_dict['Scene '+str(1)] = 2
#             scene.append(line)
#             slugline_dic[scene_no] = line.split(
#                 "\n")[0].strip('0123456789.- ')

#         else:
#             lis = line.split("\n")
#             lis = [l.strip() for l in lis]
#             print(" \n Line 363 probable dialogue list", lis)
#             word = lis[0]
#             extendedSpeaker = ""
#             if word.split('(')[0].strip() in characters:
#                 mydic = {}
#                 prev_dial_speaker = speaker
#                 speakerline = word.split('(')
#                 # speaker = word.split('(')[0].strip()
#                 speaker = speakerline[0].strip()
#                 print("Speaker 378", speaker)
#                 extendedSpeaker = word.strip()
#                 if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
#                     pc = pc+1
#                     parenthetical = lis[1]
#                     parenthetical = parenthetical.replace("\n", "")
#                     dia = ' '.join(lis[2:])
#                     dia = dia.replace("\n", "")
#                     # renu
#                     # dia=dia.replace("\"", '')

#                 else:
#                     dia = ''.join(lis[1:])
#                     dia = dia.replace("\n", "")
#                     dia = dia.replace("\"", '')
#                     print("  length dia\n", len(dia))
#                 if not (len(dia) == 0 and parenthetical == "NONE"):
#                     print(" len dia != and Parenthetical == NONE: 384 ")
#                     if i-1 >= 0:
#                         try:
#                             prev = main_lis[scene_no-1][i-1]
#                         except:
#                             prev = ""
#                     else:
#                         prev = ""
#                     try:
#                         next = main_lis[scene_no-1][i+1]
#                     except:
#                         next = ""
#                     # prev is previous speaker and next is next speaker of the dialogue
#                     mydic[speaker] = [parenthetical,
#                                         scene_no, dia, len(dia), prev, next, extendedSpeaker]
#                     print("mydic  398", speaker, mydic[speaker])
#                     dia_count['Scene '+str(scene_no)] += 1
#                     # print(mydic)
#                     prev, next = "", ""
#                     i = i+1
#                     speakers.append(speaker)
#                     parenthetical_lis.append(parenthetical)
#                     dialogues.append(mydic)
#                     scene.append(mydic)
#                 parenthetical = "NONE"
#             else:
#                 line = line.replace("\n", " ")
#                 line = ' '.join(line.split())
#                 if line.strip() in transitions:
#                     scene.append({'Transition': line.strip()})
#                     continue
#                 actionline.append(line)
#                 scene.append(line.strip())

#     scenes.append(scene)
#     parenthetical_count_dict['Scene '+str(scene_no)] = pc
#     speakers = list(set(speakers))
#     scenes = scenes[1:]
#     print("Scenes:", scenes)
#     # for removing '\n' from action lines
#     # return scenes also if '\n' required and modify practice_with_db also
#     # s = []
#     # for scene in scenes:
#     #     s1=[]
#     #     for ele in scene:
#     #         if type(ele) == type(""):
#     #             s1.extend(ele.split("\n"))
#     #         else:
#     #             s1.append(ele)
#     #     s.append(s1)
#     return scenes,actionline,parenthetical_lis,speakers,dialogues


def language_detector(text):
    """Return the source language the translation service detects for ``text``.

    The text is sent to ``translate_client.translate`` with Hindi (``'hi'``)
    as the target language.  Only the ``"detectedSourceLanguage"`` entry of
    the response is returned.
    """
    return translate_client.translate(text, target_language='hi')["detectedSourceLanguage"]

def script_det(text):
    """Return the script of ``text`` as judged from its first relevant character.

    The first character of ``text`` that is not in the punctuation set is
    passed to ``script_cat``, and the first element of its result is
    returned.  When ``text`` is empty or all punctuation, ``script_cat``
    receives an empty string.
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequence ``\,``, which newer Python versions warn about.  The set of
    # characters is unchanged.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((char for char in text if char not in punctuations), "")
    return script_cat(first_char)[0]


'''
A. Language with the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages.
'''

def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise the languages used by the dialogues.

    Args:
        dialogue_language: One detected language code per dialogue.
        non_dial_src_lang: Language code of the action lines.

    Returns:
        A tuple ``(A, B, C)``:
          * ``A`` - the most frequent dialogue language (the first one seen
            wins a tie);
          * ``B`` - number of dialogues in the action-line language, or 0
            when only one dialogue language exists;
          * ``C`` - number of dialogues in any language other than ``A`` and
            the action-line language.

    Raises:
        IndexError: If ``dialogue_language`` is empty.
    """
    counts = Counter(dialogue_language)
    A = counts.most_common()[0][0]
    if len(counts) == 1:
        return A, 0, 0

    B = counts.get(non_dial_src_lang, 0)
    # Count C from the totals.  The former ``sum(sorted_values[2:])`` was
    # only right when the action-line language ranked second.
    C = sum(counts.values()) - counts[A]
    if non_dial_src_lang != A:
        C -= B
    return A, B, C


def dial_each_word_lang1(non_dial_src_lang, dial):
    """Report whether any word of ``dial`` is detected as the action-line language.

    Words are checked in order with ``language_detector`` and checking stops
    at the first match.

    Returns:
        The string ``"True"`` or ``"False"`` (not a bool).
    """
    has_match = any(
        language_detector(token) == non_dial_src_lang for token in dial.split()
    )
    return "True" if has_match else "False"

def dial_each_word_lang2(non_dial_src_lang, A, dial ):
    """Report whether any word of ``dial`` is in a third language.

    A word counts when ``language_detector`` reports a language that is
    neither the action-line language nor the main dialogue language ``A``.

    Returns:
        The string ``"True"`` or ``"False"`` (not a bool).
    """
    expected = (non_dial_src_lang, A)
    for word in dial.split():
        # The former test ``lang != non_dial_src_lang or lang != A`` was true
        # for every word whenever the two languages differed.  It also
        # queried the detector twice per word.
        if language_detector(word) not in expected:
            return "True"
    return "False"

def word_with_actionline(scenes, A, non_dial_src_lang):
    """Report whether a dialogue in language ``A`` contains an action-line-language word.

    The first element of each scene, string elements and ``'Transition'``
    entries are skipped.  Only dialogues whose detected language equals
    ``A`` are inspected word by word.

    Args:
        scenes: Scene lists as returned by getScenes().
        A: Main dialogue language.
        non_dial_src_lang: Action-line language.

    Returns:
        ``"True"`` as soon as one such dialogue is found, otherwise
        ``"False"``.  ``"False"`` is returned immediately when ``A`` equals
        ``non_dial_src_lang``.
    """
    if A == non_dial_src_lang:
        return "False"
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                # The former code read its result variable here even when no
                # earlier dialogue had set it, raising UnboundLocalError.
                continue
            if dial_each_word_lang1(non_dial_src_lang, dialogue) == "True":
                return "True"
    # Explicit result instead of falling off the end with None.
    return "False"


def word_with_other(scenes, A, non_dial_src_lang):
    """Report whether a dialogue in language ``A`` is flagged by dial_each_word_lang2().

    The first element of each scene, string elements and ``'Transition'``
    entries are skipped.  Only dialogues whose detected language equals
    ``A`` are passed to ``dial_each_word_lang2``.

    Args:
        scenes: Scene lists as returned by getScenes().
        A: Main dialogue language.
        non_dial_src_lang: Action-line language.

    Returns:
        ``"True"`` as soon as one dialogue is flagged, otherwise ``"False"``.
    """
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            if dial_each_word_lang2(non_dial_src_lang, A, dialogue) == "True":
                return "True"
    # Explicit result instead of falling off the end with None.
    return "False"


def getInputs(filename1):
    """Analyse a script file and derive its language/script settings.

    The file is parsed with getRefined(), getSlugAndNonSlug(), getSpeakers()
    and getScenes().  The language and script of the first action line and
    of every dialogue are then detected.

    Args:
        filename1: Path of the script file.

    Returns:
        A list ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]``
        where the four options are ``"Yes"`` or ``"No"``:
          * option3 - B > 0 (see A_B_C);
          * option4 - C > 0 (see A_B_C);
          * option5 - result of word_with_actionline();
          * option6 - result of word_with_other().

    Raises:
        ValueError: If the parsed script has no action line or no dialogue.
    """
    refined, total_scenes = getRefined(filename1)
    sluglines, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, actionline, parenthetical_lis, speakers, dialogues = getScenes(refined, total_scenes, characters)
    print("line 405:scenes: ", scenes)

    language_of_all_dialogues = []
    script_of_all_dialogues = []
    non_dial_src_lang = None
    non_dial_src_script = None
    action_line_seen = False
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            if i == 0:
                continue
            if isinstance(line, str):
                # Only the first action line decides the action-line
                # language and script.
                if not action_line_seen:
                    non_dial_src_lang = language_detector(line)
                    non_dial_src_script = script_det(line)
                    action_line_seen = True
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            language_of_all_dialogues.append(language_detector(dialogue))
            script_of_all_dialogues.append(script_det(dialogue))

    # Fail with a clear message instead of UnboundLocalError / IndexError
    # further down.
    if not action_line_seen:
        raise ValueError("no action line found in script; cannot detect the action-line language")
    if not language_of_all_dialogues:
        raise ValueError("no dialogue found in script; cannot detect the dialogue language")

    A, B, C = A_B_C(language_of_all_dialogues, non_dial_src_lang)
    dial_src_script = mode(script_of_all_dialogues)

    word_lang_with_actionline = word_with_actionline(scenes, A, non_dial_src_lang)
    word_lang_with_other = word_with_other(scenes, A, non_dial_src_lang)

    print("actionline_lanuge", non_dial_src_lang)

    print("A = {} B = {} C = {}".format(A, B, C))
    print("dial_language", A)
    dial_src_lang = A

    print("dial_src_script", dial_src_script)

    if B > 0:
        print("UI option3 - yes")
        UI_option3 = "Yes"
    else:
        print("UI option3 - no")
        UI_option3 = "No"

    if C > 0:
        print("UI option4 - yes")
        UI_option4 = "Yes"
    else:
        print("UI option4 - no")
        UI_option4 = "No"

    if word_lang_with_actionline == "True":
        print("UI option5 - Yes")
        UI_option5 = "Yes"
    else:
        print("UI_option5 - NO")
        UI_option5 = "No"

    if word_lang_with_other == "True":
        print("UI option6 - Yes")
        UI_option6 = "Yes"
    else:
        print("UI option6 - No")
        UI_option6 = "No"

    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script, UI_option3, UI_option4, UI_option5, UI_option6]

# filename1 = sys.argv[1]
# getInputs(filename1)