165 lines
6.1 KiB
Python
Executable File
165 lines
6.1 KiB
Python
Executable File
import azure.cognitiveservices.speech as speechsdk
|
|
import os
|
|
import time
|
|
import pprint
|
|
import json
|
|
import srt
|
|
import datetime
|
|
import sys
|
|
import moviepy.editor as mp
|
|
import random
|
|
|
|
# Absolute path of the subtitling directory. In the portion of the file
# visible here it is referenced only by commented-out legacy code.
basepath = "/home/user/mnf/project/MNF/conversion/subtitling"
|
|
|
|
|
|
# filename2 = sys.argv[1]
|
|
# lang_code = sys.argv[2]
|
|
# movie_name = sys.argv[3]
|
|
# #lang_code = sys.argv[1]
|
|
# current = basepath + "/" + movie_name
|
|
|
|
# filename1 = os.path.splitext(filename2)[0]
|
|
|
|
# temp = basepath+"/"+filename2
|
|
|
|
# def azure_sub(filename2, lang_code, current):
|
|
# # my_clip = mp.VideoFileClip(filename2) # uncomment when running from UI
|
|
# # # my_clip = mp.VideoFileClip(temp) # comment when running from command prompt
|
|
# # with open(rf"{current}/azure_subtitle.srt", "w") as f:
|
|
# # temp = current+"/"+filename1+".wav"
|
|
# # temp1 = filename1+".wav"
|
|
# # my_clip.audio.write_audiofile(temp)
|
|
# temp = current + "/" + filename2
|
|
|
|
# path = os.getcwd()
|
|
# Creates an instance of a speech config with specified subscription key and service region.
|
|
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
|
|
def _group_words_into_cues(words, max_cue_ticks):
    """Group recognized words into subtitle cues of bounded length.

    Args:
        words: Iterable of dicts carrying a "Word" string and an integer
            "Offset"; an optional "Duration" is used when present.  Offsets
            and durations are treated as 100-nanosecond ticks
            (10,000,000 per second).
        max_cue_ticks: A new cue is started once a word begins at least this
            many ticks after the first word of the running cue.

    Returns:
        A list of ``(start_ticks, end_ticks, text)`` tuples in input order.
        ``start_ticks`` is the offset of the cue's first word and
        ``end_ticks`` the end of its last word.
    """
    cues = []
    cue_words = []
    cue_start = cue_end = 0
    for word in words:
        offset = word["Offset"]
        # Close the running cue before adding a word that starts outside
        # its time window.
        if cue_words and offset - cue_start >= max_cue_ticks:
            cues.append((cue_start, cue_end, " ".join(cue_words)))
            cue_words = []
        if not cue_words:
            cue_start = cue_end = offset
        cue_words.append(word["Word"])
        # "Duration" is expected in the detailed word-level output; fall
        # back to a zero-length word when it is absent.
        cue_end = max(cue_end, offset + word.get("Duration", 0))
    if cue_words:
        cues.append((cue_start, cue_end, " ".join(cue_words)))
    return cues


def azure_sub_temp(wav_file, lang_code, current, max_cue_seconds=3):
    """Transcribe a WAV file with Azure Speech and write an SRT subtitle file.

    Runs continuous recognition over ``<current>/<wav_file>``, collects the
    word-level timestamps of the highest-confidence alternative of every
    final result, groups the words into cues and writes them to
    ``<current>/az_subtitles.srt`` (UTF-8).  An empty file is written when
    nothing was recognized.

    The subscription key and region are read from the ``AZURE_SPEECH_KEY``
    and ``AZURE_SPEECH_REGION`` environment variables, falling back to the
    values that were previously hard-coded here.

    Args:
        wav_file: File name of the audio file, relative to ``current``.
        lang_code: Recognition language passed to the service (converted
            with ``str``), e.g. ``"en-US"``.
        current: Directory holding the audio file; the subtitle file is
            written there as well.
        max_cue_seconds: Maximum time span, in seconds, between the first
            word of a cue and the start of its last word.  Defaults to 3.

    Returns:
        None.
    """
    import threading

    wave_file_path = current + "/" + wav_file

    # NOTE(security): the fallback key below is embedded in source control.
    # It is kept only so existing deployments keep working; it should be
    # rotated and supplied through the environment instead.
    speech_key = os.environ.get(
        "AZURE_SPEECH_KEY", "49301a4f2b7240d29c7ffcc4828d345d")
    service_region = os.environ.get("AZURE_SPEECH_REGION", "eastus")

    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region)
    speech_config.speech_recognition_language = str(lang_code)
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    # Audio configuration that points at the input file.
    audio_input = speechsdk.audio.AudioConfig(filename=wave_file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_input)

    all_results = []
    words = []
    # Local completion flag: a module-level global would be shared (and
    # clobbered) by concurrent calls of this function.
    finished = threading.Event()

    def handle_final_result(evt):
        """Collect the text and word timings of one final result."""
        all_results.append(evt.result.text)
        payload = json.loads(evt.result.json)
        # Results without alternatives (e.g. no speech matched) carry no
        # word timings; skip them instead of raising inside the callback.
        candidates = payload.get("NBest") or []
        if not candidates:
            return
        best = max(candidates, key=lambda item: item.get("Confidence") or 0)
        words.extend(best.get("Words") or [])

    def stop_cb(evt):
        """Signal the waiting caller that recognition has ended."""
        print('CLOSING on {}'.format(evt))
        finished.set()

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer.
    speech_recognizer.recognizing.connect(
        lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(
        lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(
        lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(
        lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(
        lambda evt: print('CANCELED {}'.format(evt)))
    # Finish on either session-stopped or canceled events.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    finished.wait()
    # Stop once, from the calling thread, rather than from inside each
    # callback invocation.
    speech_recognizer.stop_continuous_recognition()

    print("Printing all results:")
    print(all_results)

    ##-- Code to Create Subtitle --#
    ticks_per_second = 10_000_000
    cues = _group_words_into_cues(
        words, int(max_cue_seconds * ticks_per_second))
    # One tick is 100 ns, i.e. a tenth of a microsecond.  Passing the value
    # by keyword avoids mixing up timedelta's positional units.
    transcriptions = [
        srt.Subtitle(
            index,
            datetime.timedelta(microseconds=start_ticks / 10),
            datetime.timedelta(microseconds=end_ticks / 10),
            text,
        )
        for index, (start_ticks, end_ticks, text) in enumerate(cues, start=1)
    ]
    subtitles = srt.compose(transcriptions)
    # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
    with open(f"{current}/az_subtitles.srt", "w", encoding="utf-8") as f:
        f.write(subtitles)
|
|
|
|
|
|
# wav = "audio_404.wav"
|
|
# lang = "en-US"
|
|
# loc = "/home/user/mnf/project/MNF/conversion/subtitling"
|
|
# azure_sub_temp(wav, lang, loc)
|