# Conversion_Kitchen_Code/kitchen_counter/conversion/translation/detection_20_dec.py

from google.cloud import translate_v2 as Translate
from google.cloud import translate
from .script_detector import script_cat
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Cm, Pt

# import textract
from tqdm import tqdm
import sys 
import re
import docx
import os
# Module-level python-docx document. In this file it is only used further down
# to configure the 'Normal' style font (Courier New, 12pt).
doc = docx.Document() 
from docx.shared import Inches, Cm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
from MNF.settings import BasePath
# Value returned by the project's BasePath(); it is interpolated into every
# file path built in this module (credentials file, file.txt, file1.txt).
basePath = BasePath()


# Google Cloud Translation setup.
# The credentials path is exported BEFORE the clients are constructed below;
# keep this ordering (the Google client libraries are expected to read
# GOOGLE_APPLICATION_CREDENTIALS when a client is created).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=rf"{basePath}/conversion/My First Project-2573112d5326.json"
from google.cloud import translate
from google.cloud import translate_v2 as Translate
# v2 client: used by language_detector() in this module.
translate_client = Translate.Client()
# TranslationServiceClient plus its resource path. Neither `client` nor
# `parent` is referenced in the visible part of this file.
client = translate.TranslationServiceClient()
project_id = 'authentic-bongo-272808'
location = "global"
parent = f"projects/{project_id}/locations/{location}"


# Scene-heading matcher: optional scene number, optional '.', optional
# whitespace, then one of I/E + N/X + T (so INT and EXT, but also ENT and IXT).
slug_pattern = r'[\d]*[.]?[\s]*[IE][NX]T'

# Short-form heading matcher: optional scene number and whitespace, then
# I or E, '/', I or E, and an optional trailing '.' (e.g. "I/E", "E/I.").
pat = r'[\d]*[\s]*[IE]/[IE][.]?'

# Paragraphs that are exactly one of these are stored as {'Transition': ...}.
transitions = [
    'CUT TO:',
    'FADE IN:',
    'FADE OUT:',
    'DISSOLVE TO:',
    'JUMP CUT TO:',
    'JUMP TO:',
    'CUT BACK TO:',
    'INTERCUT WITH:',
    'I/C WITH:',
    'BACK TO:',
    'INTERVAL',
]

# An upper-case candidate containing any of these words is not a character name.
reserved_words = [
    'MONTAGE',
    'PBS',
    'FADE',
    'FADE',
    'TITLE',
    'SPLIT',
    'SCREEN',
    'CUT',
]

# Default paragraph font of the module-level document: Courier New, 12pt.
style = doc.styles['Normal']
font = style.font
font.name = 'Courier New'
font.size = Pt(12)

def breaksen(s, max_words=256, chunk_words=32):
    """Split a long sentence into word-bounded chunks.

    Args:
        s: Text to split. Words are whitespace-delimited.
        max_words: Inputs with at most this many words are returned untouched
            (original whitespace included) as a single-element list.
        chunk_words: Number of words per chunk for longer inputs. Must be a
            positive integer.

    Returns:
        list[str]: ``[s]`` for short inputs; otherwise the words re-joined
        with single spaces in consecutive groups of ``chunk_words``. No empty
        chunk is produced when the word count is an exact multiple of
        ``chunk_words``.
    """
    # Split once; the word list is reused for the length check and the slicing.
    words = s.split()
    if len(words) <= max_words:
        return [s]
    # Stepping by chunk_words stops exactly at the last word, so there is no
    # trailing "" entry for inputs whose length divides evenly.
    return [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]

# Line prefixes that mark a scene heading (slugline).
_SCENE_HEADING_PREFIXES = (
    'INT.', 'INT ', 'I/E', 'E/I', 'EXT.', 'EXT ', 'EXT/INT', 'INT/EXT',
)

# Character-cue line: name words (letters/apostrophes, optionally followed by
# spaces and hyphens), optional "#<digit>" numbering, optional "(...)" suffix.
# It accepts the same strings as
#   ([A-Z'’]+[\s]*[-]*)*([#]*[\s]*[1-9])*(\(.*\))*
# but the name part is written without a quantifier nested inside a
# quantifier; the nested form can backtrack exponentially on a long
# all-caps line that ends up not matching (e.g. one ending in '.').
_CHARACTER_CUE_RE = re.compile(
    r"(?:[A-Z'’]+(?:(?:\s+-*|-+)[A-Z'’]+)*\s*-*)?"
    r"(?:#*\s*[1-9])*"
    r"(?:\(.*\))*"
)


def _starts_scene(text, allow_short_form=False):
    """Return True when stripped *text* begins like a scene heading."""
    if text.startswith(_SCENE_HEADING_PREFIXES) or re.match(slug_pattern, text):
        return True
    return bool(allow_short_form and re.match(pat, text))


def getRefined(filename1):
    """Extract a script's text and return its paragraphs from the first scene on.

    The text of ``filename1`` is extracted with textract and regrouped line by
    line: everything before the first scene heading is dropped, and blank
    lines between a character cue and the first line after it are removed so
    the cue and that line end up in the same paragraph. The regrouped text is
    then split into paragraphs on blank lines.

    Side effects: rewrites ``file.txt`` and ``file1.txt`` under
    ``{basePath}/conversion/translation``. The paths are fixed, so two
    concurrent calls would overwrite each other's files.

    Args:
        filename1: Path of the script file handed to ``textract.process``.

    Returns:
        tuple: ``(refined, total_scenes)``. ``refined`` is the list of
        stripped, non-empty paragraphs starting at the first paragraph that
        looks like a scene heading (or every paragraph if none does).
        Despite its name, ``total_scenes`` is the number of paragraphs that
        precede that first heading.
    """
    # Imported lazily: the module-level `import textract` is commented out,
    # so without this the calls below fail with NameError.
    import textract

    raw_path = rf"{basePath}/conversion/translation/file.txt"
    regrouped_path = rf"{basePath}/conversion/translation/file1.txt"

    # Round-trip the extracted bytes through file.txt so they are decoded as
    # UTF-8 with undecodable bytes dropped.
    extracted = textract.process(filename1, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as handle:
        handle.write(extracted)
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as handle:
        script_text = handle.read()

    seen_heading = False        # nothing is written before the first heading
    awaiting_dialogue = False   # True right after a cue (and its parentheticals)
    with open(regrouped_path, 'w', encoding="utf8", errors='ignore') as out:
        for raw_line in script_text.split("\n"):
            line = raw_line.strip()

            if _starts_scene(line):
                seen_heading = True
                # Headings are written unstripped, exactly as read.
                out.write(raw_line)
                out.write('\n')
                continue
            if not seen_heading:
                continue

            if awaiting_dialogue:
                if not line:
                    # Swallow blank lines between the cue and what follows it.
                    continue
                out.write(line)
                out.write('\n')
                # A parenthetical keeps us waiting; anything else ends the wait.
                if not re.match(r"\(.*\)", line):
                    awaiting_dialogue = False
                continue

            if line.isupper() and _CHARACTER_CUE_RE.fullmatch(line):
                awaiting_dialogue = True
            # Cues, action lines and blank lines are all written as-is; the
            # blank lines are what later separates paragraphs.
            out.write(line)
            out.write('\n')

    # Second pass through textract, now on the regrouped text; file.txt is
    # overwritten with the result.
    regrouped = textract.process(regrouped_path, encoding="utf8", errors='ignore')
    with open(raw_path, 'wb') as handle:
        handle.write(regrouped)
    # Decode explicitly as UTF-8 (matching how the data was produced) instead
    # of relying on the platform default encoding.
    with open(raw_path, 'r', encoding="utf8", errors='ignore') as handle:
        paragraphs = handle.read().split('\n\n')

    refined = [paragraph.strip() for paragraph in paragraphs]
    refined = [paragraph for paragraph in refined if paragraph != ""]

    # Count and drop the paragraphs in front of the first scene heading.
    total_scenes = 0
    for index, paragraph in enumerate(refined):
        if _starts_scene(paragraph, allow_short_form=True):
            refined = refined[index:]
            break
        total_scenes += 1

    return refined, total_scenes

def getSlugAndNonSlug(refined):
    """Partition paragraphs into scene headings and everything else.

    Args:
        refined: Iterable of paragraph strings.

    Returns:
        tuple: ``(sluglines, without_slug)``; both lists hold the stripped
        paragraphs in their original order.
    """
    heading_prefixes = (
        'INT.', 'INT', 'EXT.', 'EXT', 'EXT/INT', 'I/E', 'E/I', 'INT/EXT',
    )
    sluglines, without_slug = [], []
    for block in refined:
        text = block.strip()
        looks_like_heading = (
            text.startswith(heading_prefixes)
            or re.match(slug_pattern, text)
            or re.match(pat, text)
        )
        bucket = sluglines if looks_like_heading else without_slug
        bucket.append(text)
    return sluglines, without_slug

def getSpeakers(without_slug):
    """Collect the distinct character names found in non-heading paragraphs.

    A line is a candidate when it is upper-case, the look-ahead line chosen
    below is not blank, and it starts like a character cue. The name is the
    text before the first "(". Names containing a reserved word are dropped.

    Args:
        without_slug: Iterable of paragraph strings (lines separated by "\n").

    Returns:
        list: Unique character names, in set iteration order.
    """
    cue_pattern = r"[A-Z'’]+[\s]*[-]*[A-Z'’]*([#]*[\s]*[1-9])*(\(.*\))*"
    candidates = []
    for block in without_slug:
        rows = block.split('\n')
        highest_probe = len(rows) - 2
        for position, row in enumerate(rows, start=1):
            # Look-ahead index: position + 1, capped at the last row (for a
            # single-row paragraph this resolves to the row itself).
            probe = min(position, highest_probe) + 1
            if not row.isupper():
                continue
            if rows[probe].strip() == "":
                continue
            if re.match(cue_pattern, row):
                candidates.append(row.split("(")[0].strip())

    unique_names = list(set(candidates))
    non_empty = [name for name in unique_names if len(name) > 0]
    return [
        name
        for name in non_empty
        if set(name.split(" ")).intersection(reserved_words) == set()
    ]


def getScenes(refined,total_scenes,characters):
    """Group paragraphs into scenes and collect dialogue / action details.

    Args:
        refined: Paragraph strings; a paragraph that looks like a scene
            heading starts a new scene.
        total_scenes: Accepted but not used anywhere in this function.
        characters: Collection of character names; a paragraph whose first
            line (text before any "(") is in it is treated as dialogue.

    Returns:
        tuple: ``(s, actionline, parenthetical_lis, speakers, dialogues)``
        - ``s``: one list per scene. String entries are the heading line(s)
          and action lines; dict entries are either
          ``{'Transition': text}`` or
          ``{speaker: [parenthetical, scene_no, dia, len(dia), prev, next]}``.
        - ``actionline``: every action paragraph, whitespace-normalised.
        - ``parenthetical_lis``: parenthetical (or 'NONE') per stored dialogue.
        - ``speakers``: de-duplicated speaker names (set order).
        - ``dialogues``: every dialogue dict, in order of appearance.
    """
    # i counts the dialogues stored so far in the current scene.
    i=0
    scene=[]
    dialogues=[]
    speakers=[]
    # Filled with scene_no -> cleaned heading text, but never returned.
    slugline_dic={}
    prev_dial_speaker=""
    next_dial_speaker=""
    # pc counts parentheticals; it is not returned.
    pc=0
    scene_no=0
    actionline=[]
    successor_scene_no=0
    predecessor_scene_no=0
    parenthetical_lis=[]
    
    scenes=[]
    speaker=""
    parenthetical='NONE'
    # Unused: the checks below use the module-level slug_pattern instead.
    patttern=r'[\d]*[.]?[\s]*[IE][NX]T'
    for line in refined:
        if line.strip().startswith(('INT.','INT')) or line.strip().startswith(('EXT.','EXT')) or line.strip().startswith('EXT/INT') or line.strip().startswith('INT/EXT') or line.strip().startswith(('I/E','E/I')) or re.match(slug_pattern,line.strip()) or re.match(pat,line.strip()):
            # New heading: bank the scene built so far (for the very first
            # heading that is whatever preceded it, discarded after the loop).
            scenes.append(scene)
            scene=[]
            i=0
            scene_no+=1
            scene.append(line)
            # First line of the heading with leading/trailing digits, dots,
            # dashes and spaces removed.
            slugline_dic[scene_no]=line.split("\n")[0].strip('0123456789.- ')

        else:
            lis=line.split("\n")
            lis=[l.strip() for l in lis]
            word=lis[0]
            if word.split('(')[0].strip() in characters:
                # Dialogue paragraph: first line is the character cue.
                mydic={}
                prev_dial_speaker=speaker
                speaker=word.split('(')[0].strip()
                if len(lis)>1 and re.match(r"\(.*\)",lis[1]):
                    # Second line is a parenthetical; the rest is dialogue.
                    pc=pc+1
                    parenthetical=lis[1]
                    parenthetical=parenthetical.replace("\n","")
                    dia=' '.join(lis[2:])
                    dia=dia.replace("\n","")
                    # Strip double quotes from the dialogue text.
                    dia=dia.replace("\"", '')
                
                else:
                    # NOTE(review): lines are joined with '' here but with ' '
                    # in the parenthetical branch above, so wrapped dialogue
                    # lines get glued together without a space - confirm
                    # whether that is intended.
                    dia=''.join(lis[1:])
                    dia=dia.replace("\n","")
                    dia=dia.replace("\"", '')
                # Skip cues that have neither dialogue text nor a parenthetical.
                if not (len(dia)==0 and parenthetical=="NONE"):
                    
                    # NOTE(review): main_lis is not defined anywhere in the
                    # visible source. If it is not defined elsewhere, both
                    # lookups below raise NameError, the bare excepts swallow
                    # it, and prev/next are always "".
                    if i-1 >= 0:
                        try:
                            prev=main_lis[scene_no-1][i-1]
                        except:
                            prev=""
                    else:
                        prev=""
                    try:
                        next=main_lis[scene_no-1][i+1]
                    except:
                        next=""
                    #prev is previous speaker and next is next speaker of the dialogue
                    mydic[speaker]=[parenthetical,scene_no,dia,len(dia),prev,next]
                    # print(mydic)
                    prev,next="",""
                    i=i+1
                    speakers.append(speaker)
                    parenthetical_lis.append(parenthetical)
                    dialogues.append(mydic)
                    scene.append(mydic)
                # Reset so the next cue does not inherit this parenthetical.
                parenthetical="NONE"
            else:
                # Action (or transition) paragraph: collapse all whitespace,
                # newlines included, to single spaces.
                line=line.replace("\n"," ")
                line=' '.join(line.split())
                if line.strip() in transitions:
                    scene.append({'Transition':line.strip()})
                    continue
                actionline.append(line)
                scene.append(line.strip())


    # Bank the last scene, then drop the placeholder that was banked when the
    # first heading was seen.
    scenes.append(scene)
    speakers=list(set(speakers))
    scenes=scenes[1:]
    # Flatten: string entries are split on "\n" into separate entries (only a
    # heading paragraph can still contain newlines at this point); dict
    # entries are kept as they are.
    s = []
    for scene in scenes:
        s1=[]
        for ele in scene:
            if type(ele) == type(""):
                s1.extend(ele.split("\n"))
            else:
                s1.append(ele)
        s.append(s1)
    return s,actionline,parenthetical_lis,speakers,dialogues


def language_detector(text):
    """Return the source language Google detects for *text*.

    The text is sent to the v2 translate endpoint with Hindi ('hi') as the
    target, and the ``detectedSourceLanguage`` field of the response is
    returned; the translation itself is discarded.
    """
    response = translate_client.translate(text, target_language='hi')
    return response["detectedSourceLanguage"]


def getInputs(filename1):
    """Detect the language and script of a script's action lines and dialogue.

    The file is parsed into scenes, which are scanned in order. For each
    scene the last usable action line and the last usable dialogue text are
    taken as samples and passed to ``language_detector`` and ``script_cat``.
    Scanning stops after the first scene that provides both kinds of sample;
    values found in earlier scenes are kept until a later scene replaces them.

    A sample is usable when it has at least two characters, because the
    script category is probed on the character at index 1. Transition
    entries are never used as dialogue samples.

    Args:
        filename1: Path of the script file to analyse.

    Returns:
        list: ``[non_dial_src_lang, dial_src_lang, dial_src_script,
        non_dial_src_script]``. An entry is ``''`` when the script contains
        no usable sample of that kind.
    """
    refined, total_scenes = getRefined(filename1)
    _, without_slug = getSlugAndNonSlug(refined)
    characters = getSpeakers(without_slug)
    scenes, *_ = getScenes(refined, total_scenes, characters)

    # Defaults so the return below is always well-defined, even for scripts
    # with no action line or no dialogue at all.
    non_dial_src_lang = ''
    non_dial_src_script = ''
    dial_src_lang = ''
    dial_src_script = ''

    for scene in tqdm(scenes):
        action_sample = None
        dialogue_sample = None

        # scene[0] is the heading line, so sampling starts at index 1.
        for element in scene[1:]:
            if isinstance(element, str):
                if len(element) >= 2:
                    action_sample = element
                continue

            [speaker] = element.keys()
            if speaker == 'Transition':
                # Transitions carry no dialogue and must not count as one.
                continue
            spoken = element[speaker][2]
            if len(spoken) >= 2:
                dialogue_sample = spoken

        # Detect once per scene on the last sample of each kind: same result
        # as detecting on every line and keeping the last, with far fewer
        # calls to the translation service.
        if action_sample is not None:
            non_dial_src_lang = language_detector(action_sample)
            non_dial_src_script = script_cat(action_sample[1])[0]
        if dialogue_sample is not None:
            dial_src_lang = language_detector(dialogue_sample)
            dial_src_script = script_cat(dialogue_sample[1])[0]

        if action_sample is not None and dialogue_sample is not None:
            break

    return [non_dial_src_lang, dial_src_lang, dial_src_script, non_dial_src_script]