Conversion_Kitchen_Code/kitchen_counter/conversion/subtitling/Azure_testing.py

146 lines
5.0 KiB
Python
Raw Normal View History

2024-04-27 09:33:09 +00:00
import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime
class Azure():
    """Holds the working directory captured when the module is loaded."""

    # NOTE(review): SOURCE lost its indentation. Only `path` is kept in the
    # class body because the callbacks further down read `all_results`,
    # `words` and `speech_recognizer` as module globals, which would not
    # resolve if those names were class attributes. Confirm against the
    # original layout.
    path = os.getcwd()


# Creates an instance of a speech config with specified subscription key and service region.
# The key is read from the environment instead of being committed to the
# repository; the key that used to be hard-coded here should be treated as
# leaked and rotated. Region identifiers: https://aka.ms/speech/sdkregion
print("Rohit")
print("Running1")
speech_key = os.environ.get("SPEECH_KEY")
# "eastus" is the region the script previously hard-coded.
service_region = os.environ.get("SPEECH_REGION", "eastus")
if not speech_key:
    raise RuntimeError(
        "Set the SPEECH_KEY environment variable to your Azure Speech "
        "subscription key (and optionally SPEECH_REGION).")
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key, region=service_region)
# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_input = speechsdk.audio.AudioConfig(filename="audio_404.wav")
# Creates a recognizer with the given settings
speech_config.speech_recognition_language = "en-US"
# Word-level timestamps are what the subtitle step consumes (Words/Offset).
speech_config.request_word_level_timestamps()
speech_config.enable_dictation()
# Detailed output (enum value 1) carries the NBest list with per-word data.
speech_config.output_format = speechsdk.OutputFormat.Detailed
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config, audio_config=audio_input)
# Accumulators filled by handle_final_result while recognition runs.
all_results = []
results = []
transcript = []
words = []
# https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record one final recognition result in the module-level accumulators.

    Appends the plain text to ``all_results``, the ``DisplayText`` field of
    the result JSON to ``transcript``, and the word entries of the
    highest-confidence ``NBest`` candidate to ``words``.

    Args:
        evt: Recognition event; ``evt.result.text`` and ``evt.result.json``
            are read.
    """
    all_results.append(evt.result.text)
    payload = json.loads(evt.result.json)

    display_text = payload.get('DisplayText')
    if display_text is not None:
        transcript.append(display_text)

    # Results without candidates (e.g. nothing was recognized) carry no
    # NBest list; there are no word timings to collect in that case.
    candidates = payload.get('NBest') or []
    if not candidates:
        return

    # max() keeps the first candidate on ties, like index(max(...)) did.
    # A missing confidence is ranked lowest instead of raising.
    best = max(candidates, key=lambda item: item.get('Confidence') or 0.0)
    words.extend(best.get('Words') or [])
# Flipped to True by stop_cb; the wait loop after
# start_continuous_recognition() polls it. (A `global` statement is not
# needed at module level, only inside the function that rebinds the name.)
done = False


def stop_cb(evt):
    """Stop continuous recognition and signal the polling loop to exit.

    Connected to the recognizer's ``session_stopped`` and ``canceled``
    events.

    Args:
        evt: The event that triggered the stop; only printed.
    """
    global done
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    done = True
# Collect every final result (text plus best-candidate word timings).
speech_recognizer.recognized.connect(handle_final_result)
# Connect callbacks to the events fired by the speech recognizer
print("Running2")
speech_recognizer.recognizing.connect(
    lambda evt: print('RECOGNIZING: {}'.format(evt)))
print("Running3")
speech_recognizer.recognized.connect(
    lambda evt: print('RECOGNIZED: {}'.format(evt)))
print("Running4")
speech_recognizer.session_started.connect(
    lambda evt: print('SESSION STARTED: {}'.format(evt)))
print("Running5")
speech_recognizer.session_stopped.connect(
    lambda evt: print('SESSION STOPPED {}'.format(evt)))
print("Running6")
speech_recognizer.canceled.connect(
    lambda evt: print('CANCELED {}'.format(evt)))
# stop continuous recognition on either session stopped or canceled events
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
print("checking done 91",done)
# Results arrive through the callbacks connected above; poll until stop_cb
# sets `done`.
while not done:
    print("in loop")
    time.sleep(.5)
print("Printing all results:")
print(all_results)
# Word entries gathered by handle_final_result feed the subtitle step.
speech_to_text_response = words
def convertduration(t):
    """Split a tick count into whole seconds and leftover milliseconds.

    Args:
        t: Time value in units of 1/10000 of a millisecond (the subtitle
            code passes word ``Offset`` values here).

    Returns:
        Tuple ``(seconds, milliseconds)``: ``seconds`` is an int and
        ``milliseconds`` is the remainder below 1000. Note the second
        element is milliseconds, not microseconds.
    """
    total_ms = t / 10000
    whole_seconds = int(total_ms / 1000)
    remainder_ms = total_ms % 1000
    return whole_seconds, remainder_ms
##-- Code to Create Subtitle --#
# Word offsets are compared against seconds by dividing by this factor.
TICKS_PER_SECOND = 10000000
# Each subtitle covers (and is displayed for) roughly this many seconds.
# Named bin_seconds rather than `bin` so the builtin is not shadowed.
bin_seconds = 3
duration = 0
transcriptions = []
transcript = ""
index, prev = 0, 0
# Offset of the first word of the segment currently being assembled.
segment_start = None


def _make_subtitle(number, start_offset, text):
    """Build one subtitle that starts at start_offset and lasts bin_seconds."""
    secs, millis = convertduration(start_offset)
    # convertduration returns (seconds, milliseconds); pass them by keyword
    # so the remainder is not mistaken for microseconds.
    start = datetime.timedelta(seconds=secs, milliseconds=millis)
    end = start + datetime.timedelta(seconds=bin_seconds)
    return srt.Subtitle(number, start, end, text.strip())


for word in speech_to_text_response:
    offset = word["Offset"]
    # The subtitle must appear when its first word is spoken, so remember
    # that offset once per segment instead of overwriting it on every word.
    if segment_start is None:
        segment_start = offset
    # Forms the sentence until the bin size condition is met
    transcript = transcript + " " + word["Word"]
    # Checks whether the elapsed duration is less than the bin size
    if int(duration / TICKS_PER_SECOND) < bin_seconds:
        duration = duration + offset - prev
        prev = offset
    else:
        index = index + 1
        transcriptions.append(
            _make_subtitle(index, segment_start, transcript))
        duration = 0
        transcript = ""
        segment_start = None

# Flush the words left over after the last full bin, under their own index.
if transcript:
    index = index + 1
    transcriptions.append(_make_subtitle(index, segment_start, transcript))

subtitles = srt.compose(transcriptions)
# Explicit encoding so the output does not depend on the platform default.
with open("subtitle.srt", "w", encoding="utf-8") as f:
    f.write(subtitles)
print("Rohit")