# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection123.py

# import textract
from tqdm import tqdm
import sys 
import re
import docx
import os
# Module-level Word document; its 'Normal' style is configured further down
# in this file (Courier New, 12 pt).
doc = docx.Document() 
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from collections import Counter


from script_detector import script_cat

#google
# Point the Google client libraries at the service-account key file.
# NOTE(review): `basePath` is not defined anywhere in this file, so the
# f-string below raises NameError unless the name is provided some other
# way before this line runs -- confirm where it is supposed to come from.
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="gifted-mountain-318504-0a5f94cda0c8.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = rf"{basePath}/conversion/gifted-mountain-318504-4f001d5f08db.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# translate_v2 client; language_detector() further down calls its translate().
translate_client = Translate.Client()
# TranslationServiceClient and the resource path built for it; neither
# `client` nor `parent` is referenced again in this file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"


# Slugline prefix: optional digits, optional '.', optional whitespace, then
# one of I/E + one of N/X + T (so "INT"/"EXT", but also "IXT"/"ENT").
slug_pattern= r'[\d]*[.]?[\s]*[IE][NX]T'
# Abbreviated slugline prefix: optional digits and whitespace, then
# "I/E", "E/I", "I/I" or "E/E" with an optional trailing '.'.
pat = r'[\d]*[\s]*[IE]/[IE][.]?'
# Paragraphs that exactly equal one of these are stored as
# {'Transition': ...} entries by getScenes().
transitions = ['CUT TO:','FADE IN:','FADE OUT:','DISSOLVE TO:','JUMP CUT TO:','JUMP TO:','CUT BACK TO:','INTERCUT WITH:','I/C WITH:','BACK TO:', 'INTERVAL']
# A candidate speaker name containing any of these words is discarded by
# getSpeakers(). ('FADE' is listed twice; the duplicate has no effect.)
reserved_words = ['MONTAGE','PBS','FADE','FADE','TITLE','SPLIT', 'SCREEN','CUT']
# Configure the default paragraph style of the module-level `doc`.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

def breaksen(s, max_words=256, chunk_words=32):
    """Split an over-long text into fixed-size word chunks.

    Args:
        s: Text to split; words are separated by any whitespace.
        max_words: Texts with at most this many words are returned whole.
        chunk_words: Maximum number of words per chunk when splitting.

    Returns:
        ``[s]`` (the text untouched, whitespace included) when ``s`` has at
        most ``max_words`` words; otherwise a list of chunks holding up to
        ``chunk_words`` words each, re-joined with single spaces.
    """
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by the chunk size never produces an empty trailing chunk,
    # which the previous ``n // 32 + 1`` loop did whenever the word count
    # was an exact multiple of the chunk size.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]

def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first slugline on.

    The document is read with ``textract``, normalised line by line into
    ``file1.txt`` and then split into blank-line separated paragraphs.
    ``file.txt`` and ``file1.txt`` are written to the current working
    directory as intermediate files.

    NOTE(review): ``import textract`` is commented out at the top of this
    file; it has to be restored for this function to run.

    Args:
        filename1: Path of the script document handed to ``textract.process``.

    Returns:
        A ``(refined, total_scenes)`` tuple. ``refined`` is the list of
        stripped, non-empty paragraphs starting at the first slugline
        paragraph (the full list when no slugline is found).
        ``total_scenes`` is, despite its name, the number of paragraphs
        that precede that first slugline.
    """
    slug_prefixes = ('INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ',
                     'EXT/INT', 'INT/EXT')
    # Accepts exactly the same strings as the former
    # r"([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*", but every repetition
    # of the first group now has to consume a separator. The old nested
    # quantifier backtracked exponentially on long all-caps lines that end
    # in a character the pattern rejects (e.g. a full stop).
    speaker_cue = r"(?:[A-Z'’]+(?:\s+-*|-+))*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
    raw_name = "file.txt"
    cleaned_name = "file1.txt"

    text = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_name, 'wb') as raw_out:
        raw_out.write(text)
    with open(raw_name, 'r', encoding="utf8", errors='ignore') as raw_in:
        doc11 = raw_in.read()

    seen_slug = False       # nothing is copied before the first slugline
    dialog_coming = False   # True right after a speaker cue
    with open(cleaned_name, 'w', encoding="utf8", errors='ignore') as cleaned:
        for line in doc11.split("\n"):
            stripped = line.strip()
            if stripped.startswith(slug_prefixes) or re.match(slug_pattern, stripped):
                seen_slug = True
                # Sluglines are copied with their original surrounding whitespace.
                cleaned.write(line)
                cleaned.write('\n')
                continue
            if not seen_slug:
                continue
            if dialog_coming:
                # Drop blank lines between a speaker cue and its dialogue so
                # they end up in the same paragraph.
                if stripped == "":
                    continue
                cleaned.write(stripped)
                cleaned.write('\n')
                # A parenthetical keeps us waiting for the dialogue itself.
                if not re.match(r"\(.*\)", stripped):
                    dialog_coming = False
                continue
            if stripped.isupper() and re.fullmatch(speaker_cue, stripped):
                dialog_coming = True
            cleaned.write(stripped)
            cleaned.write('\n')

    # Round-trip the cleaned text through textract and file.txt again, as before.
    text = textract.process(cleaned_name, encoding="utf8", errors='ignore')
    with open(raw_name, 'wb') as raw_out:
        raw_out.write(text)
    # Decode explicitly as UTF-8 (the bytes were produced with encoding="utf8")
    # instead of relying on the platform default encoding.
    with open(raw_name, "r", encoding="utf8", errors='ignore') as raw_in:
        paragraphs = raw_in.read().split('\n\n')

    refined = [para.strip() for para in paragraphs]
    refined = [para for para in refined if para != ""]

    total_scenes = 0
    for index, para in enumerate(refined):
        if (para.startswith(slug_prefixes)
                or re.match(slug_pattern, para)
                or re.match(pat, para)):
            refined = refined[index:]
            break
        total_scenes += 1
    return refined, total_scenes

def getSlugAndNonSlug(refined):
    """Partition paragraphs into sluglines and all remaining paragraphs.

    A paragraph counts as a slugline when, after stripping, it starts with
    one of the INT/EXT style prefixes or matches ``slug_pattern`` or ``pat``
    at its beginning.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        ``(sluglines, without_slug)``: two lists of stripped paragraphs,
        each in input order.
    """
    heading_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT',
                        'I/E', 'E/I', 'INT/EXT')
    sluglines, without_slug = [], []
    for raw in refined:
        text = raw.strip()
        is_slug = (
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        bucket = sluglines if is_slug else without_slug
        bucket.append(text)
    return sluglines, without_slug

def getSpeakers(without_slug):
    """Collect the distinct speaker names found in non-slug paragraphs.

    A line is taken as a speaker cue when it is upper-case, a look-ahead
    line in the same paragraph is not blank, and it starts like a name
    (capital letters/apostrophes, optionally followed by a number and a
    parenthetical). The name is the text before the first "(".

    Args:
        without_slug: Iterable of paragraph strings (lines separated by "\n").

    Returns:
        List of unique names, excluding any name that contains a word from
        ``reserved_words``. The order is whatever ``set`` iteration yields.
    """
    cue_pattern = r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
    found = []
    for para in without_slug:
        rows = para.split('\n')
        last = len(rows) - 1
        for pos, row in enumerate(rows):
            # The look-ahead is two lines below the current one, clamped to
            # the paragraph's last line (for a one-line paragraph that is
            # the line itself).
            peek = rows[min(pos + 2, last)]
            if not row.isupper() or peek.strip() == "":
                continue
            if re.match(cue_pattern, row):
                found.append(row.split("(")[0].strip())

    unique = list(set(found))
    named = [name for name in unique if len(name) > 0]
    return [
        name for name in named
        if not set(name.split(" ")).intersection(reserved_words)
    ]


def getScenes(refined,total_scenes,characters):
    """Group screenplay paragraphs into scenes and collect dialogue data.

    A new scene starts at every slugline paragraph. Inside a scene, a
    paragraph whose first line (text before "(") is a known character
    becomes a dialogue entry, a paragraph equal to one of ``transitions``
    becomes ``{'Transition': text}``, and anything else is an action line.
    Paragraphs that come before the first slugline are dropped from the
    returned scenes.

    Args:
        refined: Paragraph strings (lines separated by "\n").
        total_scenes: Unused; kept for interface compatibility.
        characters: Collection of speaker names to recognise.

    Returns:
        ``(scenes, actionline, parenthetical_lis, speakers, dialogues)``:
        * scenes: one list per scene; its first entries are the lines of the
          slugline paragraph, followed by action-line strings and dicts.
        * actionline: every action line, whitespace-normalised.
        * parenthetical_lis: the parenthetical of each dialogue ("NONE"
          when there is none).
        * speakers: unique speakers that have at least one dialogue entry.
        * dialogues: ``{speaker: [parenthetical, scene_no, text, len(text),
          "", ""]}`` dicts (the same objects stored in ``scenes``). The last
          two fields are always empty strings: the earlier code tried to
          fill them from an undefined ``main_lis`` and a bare ``except``
          silently turned the resulting NameError into "".
    """
    slug_prefixes = ('INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'INT/EXT',
                     'I/E', 'E/I')
    scenes = []
    scene = []
    scene_no = 0
    actionline = []
    parenthetical_lis = []
    speakers = []
    dialogues = []

    for line in refined:
        stripped = line.strip()
        if (stripped.startswith(slug_prefixes)
                or re.match(slug_pattern, stripped)
                or re.match(pat, stripped)):
            # Close the running scene and open a new one with its slugline.
            scenes.append(scene)
            scene = [line]
            scene_no += 1
            continue

        lis = [part.strip() for part in line.split("\n")]
        speaker = lis[0].split('(')[0].strip()
        if speaker in characters:
            if len(lis) > 1 and re.match(r"\(.*\)", lis[1]):
                parenthetical = lis[1]
                spoken = lis[2:]
            else:
                parenthetical = "NONE"
                spoken = lis[1:]
            # Join wrapped dialogue lines with a space in both branches; the
            # no-parenthetical branch used '' and fused the last word of one
            # line with the first word of the next.
            dia = ' '.join(spoken).replace("\"", '')
            # A cue with neither dialogue nor parenthetical is ignored.
            if not (len(dia) == 0 and parenthetical == "NONE"):
                entry = {speaker: [parenthetical, scene_no, dia, len(dia), "", ""]}
                speakers.append(speaker)
                parenthetical_lis.append(parenthetical)
                dialogues.append(entry)
                scene.append(entry)
            continue

        # Collapse line breaks and runs of whitespace to single spaces.
        text = ' '.join(line.replace("\n", " ").split())
        if text in transitions:
            scene.append({'Transition': text})
            continue
        actionline.append(text)
        scene.append(text)

    scenes.append(scene)
    speakers = list(set(speakers))

    # scenes[0] holds whatever came before the first slugline; skip it.
    flattened = []
    for entries in scenes[1:]:
        rows = []
        for ele in entries:
            if isinstance(ele, str):
                # Multi-line slugline paragraphs become one entry per line.
                rows.extend(ele.split("\n"))
            else:
                rows.append(ele)
        flattened.append(rows)
    return flattened, actionline, parenthetical_lis, speakers, dialogues


# Script entry: the first command-line argument is the path of the script
# document to analyse (raises IndexError when it is missing).
filename1 = sys.argv[1]
#print(filename1)

# Parse the document: paragraphs -> sluglines / other paragraphs ->
# speaker names -> per-scene structure. Only `scenes` is read by the
# module-level code below; the other results are not used again in this file.
refined,total_scenes = getRefined(filename1)
#print(refined)
sluglines,without_slug = getSlugAndNonSlug(refined)
characters = getSpeakers(without_slug)
scenes,actionline,parenthetical_lis,speakers,dialogues = getScenes(refined,total_scenes,characters)
#print(scenes)

def language_detector(text):
    """Return the source language reported for *text* by the translate client.

    The text is passed to ``translate_client.translate`` with Hindi ('hi')
    as the target language and the ``detectedSourceLanguage`` field of the
    result is returned.
    """
    return translate_client.translate(
        text, target_language='hi'
    )["detectedSourceLanguage"]

def script_det(text):
    """Return the script category of the first non-punctuation character.

    Args:
        text: String to inspect.

    Returns:
        The first element of ``script_cat(ch)``, where ``ch`` is the first
        character of ``text`` that is not in the punctuation set below, or
        the empty string when every character is punctuation.
    """
    # Raw literal: the set contains a backslash followed by a comma. Written
    # as a normal string, "\," is an invalid escape sequence (SyntaxWarning
    # on Python 3.12+); the resulting characters are unchanged.
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    first_char = next((ch for ch in text if ch not in punctuations), "")
    return script_cat(first_char)[0]

'''
A. Language of the highest number of full dialogues,
B. Number of dialogues in the action-line language,
C. Number of dialogues in other languages
'''

def A_B_C(dialogue_language, non_dial_src_lang):
    """Summarise which languages the dialogues are written in.

    Args:
        dialogue_language: Detected language code of every dialogue, one
            entry per dialogue.
        non_dial_src_lang: Language code detected for the action lines.

    Returns:
        ``(A, B, C)`` where
        * A is the most frequent dialogue language (ties go to the language
          seen first), or ``None`` when there are no dialogues;
        * B is the number of dialogues in ``non_dial_src_lang``; it is 0
          when only one dialogue language exists or when the action-line
          language does not occur in any dialogue (this used to raise
          ValueError);
        * C is the number of dialogues in the languages ranked third and
          lower by frequency.
          NOTE(review): that equals "dialogues in other languages" only
          when the action-line language is ranked second -- confirm intent.
    """
    # most_common() sorts by count, descending, keeping first-seen order
    # among equal counts -- the same ranking the hand-written loops built.
    ranked = Counter(dialogue_language).most_common()
    if not ranked:
        return None, 0, 0
    A = ranked[0][0]
    if len(ranked) == 1:
        return A, 0, 0
    B = dict(ranked).get(non_dial_src_lang, 0)
    C = sum(count for _, count in ranked[2:])
    return A, B, C


#print(scenes)

# Detect the language of the action lines and of every dialogue.
# dialogue_language gets one detected language code per dialogue entry.
dialogue_language = []
# `count` acts as a "first action line already seen" flag (0 = not yet).
count =0
for scene in tqdm(scenes[:]):
    #print("scene") 

    for i,line in enumerate(scene):
        # Skip the first entry of each scene (the slugline stored by getScenes).
        if i == 0:
            continue
        if type(line)==type(""):
           #print("here")
           # Only the first string entry encountered is used to decide the
           # action-line language.
           # NOTE(review): if no scene has a string entry after index 0,
           # non_dial_src_lang is never bound and the A_B_C call below
           # raises NameError.
           if count==0:
              #print(line)  
              non_dial_src_lang = language_detector(line)
              count+=1
              #print("non_dial_src_lang", non_dial_src_lang)

        else:
            #print("line", line)
            # Non-string entries are single-key dicts: {speaker: [...]} or
            # {'Transition': text}.
            [speaker] = line.keys()
            #print([speaker])
            if speaker == 'Transition':
                continue
            
            #print("dial", line[speaker][2])
            # Index 2 of a dialogue entry is the dialogue text.
            dial_src_lang = language_detector(line[speaker][2])
            dialogue_language.append(dial_src_lang)

            #dial_src_script = script_det(line[speaker][2])

# print("non_dial_src_lang", non_dial_src_lang)
# print("dial_src_lang", dialogue_language)


#print(len(dialogue_language))
#print(Counter(dialogue_language))

# A, B and C are read as module globals by the functions and report below.
A, B, C = A_B_C(dialogue_language, non_dial_src_lang)
# print("A = {} B = {} C = {}".format(A, B, C))


def dial_each_word_lang1(non_dial_src_lang, dial):
    """Tell whether any word of *dial* is detected as the action-line language.

    Words are checked in order with ``language_detector``; the first match
    is printed and ends the search.

    Returns:
        The string "True" on a match, otherwise the string "False".
    """
    hit = next(
        (word for word in dial.split()
         if language_detector(word) == non_dial_src_lang),
        None,
    )
    if hit is None:
        return "False"
    print("word", hit)
    return "True"

def dial_each_word_lang2(non_dial_src_lang, A, dial ):
    """Tell whether any word of *dial* is in a third language.

    A word counts when its detected language is neither the action-line
    language nor the main dialogue language.

    Args:
        non_dial_src_lang: Language code of the action lines.
        A: Language code of the main dialogue language.
        dial: Dialogue text; split on whitespace into words.

    Returns:
        The string "True" as soon as such a word is found (the word is
        printed), otherwise the string "False".
    """
    known = (non_dial_src_lang, A)
    for word in dial.split():
        # Detect once per word and require the language to differ from
        # BOTH known languages. The former `x != a or x != b` test was true
        # for every word whenever the two languages differed, and it sent
        # each word to the API twice.
        if language_detector(word) not in known:
            print("in 4")
            print("word", word)
            return "True"
    return "False"

def word_with_actionline(scenes):
    """Look for a main-language dialogue containing an action-line-language word.

    Every dialogue whose detected language equals the module-level ``A`` is
    checked word by word with ``dial_each_word_lang1`` against the
    module-level ``non_dial_src_lang``.

    Args:
        scenes: Scene lists as produced by ``getScenes``.

    Returns:
        The string "True" as soon as one dialogue qualifies, otherwise
        ``None``.
    """
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            # Skip the scene's first entry and all plain-string entries.
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            # The result is only inspected for dialogues that were actually
            # checked. Previously it was read unconditionally, which raised
            # UnboundLocalError when the first dialogue was not in language A.
            found = dial_each_word_lang1(non_dial_src_lang, dialogue)
            if found == "True":
                return found
    return None


def word_with_other(scenes):
    """Look for a main-language dialogue containing a word flagged by dial_each_word_lang2.

    Every dialogue whose detected language equals the module-level ``A`` is
    passed to ``dial_each_word_lang2`` together with the module-level
    ``non_dial_src_lang`` and ``A``.

    Args:
        scenes: Scene lists as produced by ``getScenes``.

    Returns:
        The string "True" as soon as one dialogue qualifies, otherwise
        ``None``.
    """
    for scene in tqdm(scenes[:]):
        for i, line in enumerate(scene):
            # Skip the scene's first entry and all plain-string entries.
            if i == 0 or isinstance(line, str):
                continue
            [speaker] = line.keys()
            if speaker == 'Transition':
                continue
            dialogue = line[speaker][2]
            if language_detector(dialogue) != A:
                continue
            # The result is only inspected for dialogues that were actually
            # checked. Previously it was read unconditionally, which raised
            # UnboundLocalError when the first dialogue was not in language A.
            found = dial_each_word_lang2(non_dial_src_lang, A, dialogue)
            if found == "True":
                return found
    return None

# Word-level checks over all scenes; each call returns "True" or None.
word_lang_with_actionline = word_with_actionline(scenes)
#print(word_lang_with_actionline) 

word_lang_with_other = word_with_other(scenes)       
#print(word_lang_with_other) 


####
# Report on stdout.
# NOTE(review): the labels are inconsistent ("actionline_lanuge",
# "UI_option5 - NO" vs "UI option5 - Yes"). They are left untouched here
# because whatever reads this output may match on the exact text -- confirm
# before normalising.
print("actionline_lanuge", non_dial_src_lang)
#print("A = {} B = {} C = {}".format(A, B, C))
print("dial_language", A)

# Option 3: B > 0, i.e. A_B_C counted dialogues in the action-line language.
if B>0:
    print("UI option3 - yes" )
else:
    print("UI option3 - no" )

# Option 4: C > 0, i.e. A_B_C counted dialogues beyond the two most
# frequent dialogue languages.
if C>0:
    print("UI option4 - yes" )
else:
    print("UI option4 - no" )
 
# Option 5: result of word_with_actionline().
if word_lang_with_actionline=="True":
    print("UI option5 - Yes")
else:
    print("UI_option5 - NO")  

# Option 6: result of word_with_other().
if word_lang_with_other=="True":
    print("UI option6 - Yes") 
else:
    print("UI option6 - No")