You want to process audio file into text.
We will use gapi which offers ability to transform audio files into text.
# GOOGLE_APPLICATION_CREDENTIALS=./gapi-auth.json
import json
import subprocess
from google.cloud import speech
from google.cloud.proto.speech.v1 import cloud_speech_pb2
speech_client = speech.SpeechClient()
path_to_file = './input.mp4'
output_path_to_file = './out.flac'
# command = 'ffmpeg -i {input_file} -ac 1 -c:a flac answer_11.flac'
subprocess.call(['ffmpeg', '-i', '{input_file}'.format(input_file=path_to_file), '-ac', '1', '-c:a', 'flac', output_path_to_file])
def parse_speech_recognition_result(speech_recognition_result):
data = {'data': []}
for result in speech_recognition_result.results:
for alternative in result.alternatives:
data['data'].append({
'transcript': alternative.transcript,
'confidence': alternative.confidence,
})
return data
with open(output_path_to_file, 'rb') as recording_file:
recording_bytes = recording_file.read()
audio = cloud_speech_pb2.RecognitionAudio(content=recording_bytes)
config = cloud_speech_pb2.RecognitionConfig(encoding="FLAC", sample_rate_hertz=44100, language_code="en-US")
speech_recognition_result = speech_client.recognize(config=config, audio=audio)
result = parse_speech_recognition_result(speech_recognition_result)
dump_result = json.dumps(result)
print(dump_result)