"""Generate an SRT subtitle file from a WAV file using Azure Speech-to-Text."""
import datetime
import json
import os
import pprint
import random
import sys
import threading
import time

import azure.cognitiveservices.speech as speechsdk
import moviepy.editor as mp
import srt

basepath = "/home/user/mnf/project/MNF/conversion/subtitling"

# Legacy entry point, kept for reference only:
# filename2 = sys.argv[1]
# lang_code = sys.argv[2]
# movie_name = sys.argv[3]
# #lang_code = sys.argv[1]
# current = basepath + "/" + movie_name
# filename1 = os.path.splitext(filename2)[0]
# temp = basepath+"/"+filename2
# def azure_sub(filename2, lang_code, current):
#     # my_clip = mp.VideoFileClip(filename2)  # uncomment when running from UI
#     # # my_clip = mp.VideoFileClip(temp)  # comment when running from command prompt
#     # with open(rf"{current}/azure_subtitle.srt", "w") as f:
#     # temp = current+"/"+filename1+".wav"
#     # temp1 = filename1+".wav"
#     # my_clip.audio.write_audiofile(temp)
#     temp = current + "/" + filename2
#     path = os.getcwd()

# Word "Offset" values are treated as 100-nanosecond ticks (the original
# arithmetic divided by 10_000 for milliseconds and 10_000_000 for seconds).
_TICKS_PER_SECOND = 10_000_000
_TICKS_PER_MICROSECOND = 10

# Target length, in seconds, of the time window grouped into one subtitle cue.
_DEFAULT_BIN_SECONDS = 3

# NOTE(security): this key is committed to source control. It is kept only as
# a fallback so existing deployments keep working; set SPEECH_KEY /
# SPEECH_REGION in the environment, then rotate and remove this literal.
_FALLBACK_SPEECH_KEY = "49301a4f2b7240d29c7ffcc4828d345d"
_FALLBACK_SPEECH_REGION = "eastus"


def _ticks_to_timedelta(ticks):
    """Convert a count of 100-nanosecond ticks into a ``datetime.timedelta``."""
    return datetime.timedelta(microseconds=ticks / _TICKS_PER_MICROSECOND)


def _build_subtitles(words, bin_seconds=_DEFAULT_BIN_SECONDS):
    """Group word-level timings into a list of ``srt.Subtitle`` cues.

    Args:
        words: Sequence of dicts with at least ``"Word"`` and ``"Offset"``
            (ticks) keys, in chronological order. A ``"Duration"`` key (ticks)
            is used when present -- presumably always supplied by the service
            when word-level timestamps are requested; TODO confirm.
        bin_seconds: Width of the time window collected into a single cue.

    Returns:
        A list of ``srt.Subtitle`` objects indexed from 1. Each cue starts at
        the offset of its first word and ends where its last word ends.
    """
    bin_ticks = bin_seconds * _TICKS_PER_SECOND
    subtitles = []
    chunk = []

    def flush():
        # Emit the words collected so far as one cue and start a new chunk.
        if not chunk:
            return
        start = chunk[0]["Offset"]
        last = chunk[-1]
        end = last["Offset"] + last.get("Duration", 0)
        if end <= start:
            # No usable duration information: fall back to a full-bin cue so
            # the cue is not dropped for having zero length.
            end = start + bin_ticks
        subtitles.append(
            srt.Subtitle(
                len(subtitles) + 1,
                _ticks_to_timedelta(start),
                _ticks_to_timedelta(end),
                " ".join(item["Word"] for item in chunk),
            )
        )
        del chunk[:]

    for word in words:
        # Close the current cue once the next word starts outside its window.
        if chunk and word["Offset"] - chunk[0]["Offset"] >= bin_ticks:
            flush()
        chunk.append(word)
    flush()
    return subtitles


def azure_sub_temp(wav_file, lang_code, current):
    """Transcribe ``wav_file`` with Azure Speech and write ``az_subtitles.srt``.

    Runs continuous recognition over the whole file, collects the word-level
    timings of the highest-confidence alternative of every final result, and
    writes the resulting cues to ``<current>/az_subtitles.srt``.

    Args:
        wav_file: File name of the WAV audio, relative to ``current``.
        lang_code: Recognition language; passed through ``str()`` to the SDK
            (the example invocation in this file uses ``"en-US"``).
        current: Directory containing ``wav_file``; the SRT file is written
            here as well.

    Returns:
        None. The subtitles are written to disk.
    """
    wave_file_path = os.path.join(current, wav_file)

    # Creates an instance of a speech config with specified subscription key
    # and service region. Replace with your own subscription key and region
    # identifier from here: https://aka.ms/speech/sdkregion
    speech_config = speechsdk.SpeechConfig(
        subscription=os.environ.get("SPEECH_KEY", _FALLBACK_SPEECH_KEY),
        region=os.environ.get("SPEECH_REGION", _FALLBACK_SPEECH_REGION),
    )
    speech_config.speech_recognition_language = str(lang_code)
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    # Detailed output is what carries the NBest / Words payload parsed below.
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    # Creates an audio configuration that points to the audio file.
    audio_input = speechsdk.audio.AudioConfig(filename=wave_file_path)

    # Creates a recognizer with the given settings.
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_input)

    all_results = []
    words = []
    # Per-call completion flag, so overlapping calls cannot clobber each other
    # the way a module-level global would.
    finished = threading.Event()

    # https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
    def handle_final_result(evt):
        # Collect the text and word timings of one final recognition result.
        all_results.append(evt.result.text)
        payload = json.loads(evt.result.json)
        candidates = payload.get("NBest")
        if not candidates:
            # Nothing to extract (presumably a no-match / silence result).
            return
        best = max(candidates, key=lambda item: item.get("Confidence") or 0)
        words.extend(best.get("Words", []))

    def stop_cb(evt):
        # Signal the waiting caller that the session ended or was canceled.
        print('CLOSING on {}'.format(evt))
        finished.set()

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer.
    speech_recognizer.recognizing.connect(
        lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(
        lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(
        lambda evt: print('CANCELED {}'.format(evt)))
    # Stop continuous recognition on either session stopped or canceled events.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    finished.wait()
    # Stop from the calling thread rather than from inside an SDK callback.
    speech_recognizer.stop_continuous_recognition()

    print("Printing all results:")
    print(all_results)

    ##-- Code to Create Subtitle --#
    subtitles = srt.compose(_build_subtitles(words))
    # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
    with open(os.path.join(current, "az_subtitles.srt"), "w",
              encoding="utf-8") as f:
        f.write(subtitles)


# Example invocation:
# wav = "audio_404.wav"
# lang = "en-US"
# loc = "/home/user/mnf/project/MNF/conversion/subtitling"
# azure_sub_temp(wav, lang, loc)