#!/usr/bin/env ruby
require 'open3' |
require 'signal' |
require 'openai' |
# Silence-detection settings. start_ffmpeg interpolates them into ffmpeg's
# `silencedetect` filter as `n=<threshold>:d=<duration>`.
@silence_threshold, @silence_duration = '-30dB', 0.5
# Small wrapper around OpenAI::Client that transcribes audio files with the
# "whisper-1" model and retries failed requests with exponential backoff.
class AudioTranscriberApi
  # Number of retries attempted after the first failed request.
  MAX_RETRIES = 5
  # Value handed to the client as `request_timeout`.
  REQUEST_TIMEOUT = 20

  # @param access_token [String] API key passed to OpenAI::Client
  def initialize(access_token:)
    @access_token = access_token
    @client = OpenAI::Client.new(
      access_token: @access_token,
      request_timeout: REQUEST_TIMEOUT
    )
  end

  # Uploads an audio file for transcription.
  #
  # Any StandardError triggers a retry after 2**attempt seconds, up to
  # MAX_RETRIES retries. Diagnostics go to stderr so they do not mix with
  # transcripts printed on stdout.
  #
  # @param audio_file [String] path of the audio file to upload
  # @return [Object] the "text" entry of the API response
  # @raise [StandardError] the last error once all retries have failed
  def transcribe(audio_file)
    retries = 0
    begin
      # Block form guarantees the handle is closed after every attempt,
      # including attempts that raise and are retried.
      response = File.open(audio_file, 'rb') do |audio|
        @client.audio.transcribe(
          parameters: { model: 'whisper-1', file: audio, language: 'en' }
        )
      end
      response['text']
    rescue StandardError => e
      warn "Error transcribing audio file: #{e.message}"
      raise if retries >= MAX_RETRIES

      retries += 1
      delay = 2**retries
      warn "Retrying audio transcription for #{audio_file} (#{retries} times) " \
           "after exponential backoff (#{delay} seconds)"
      sleep delay
      retry
    end
  end
end
# Builds the WAV file name for a recording segment.
#
# @param file_index [#to_s] segment number
# @return [String] "output_<index>.wav", with the index left-padded with
#   zeros to at least three characters
def filename(file_index)
  padded_index = file_index.to_s.rjust(3, '0')
  format('output_%s.wav', padded_index)
end
# Closes +io+ if it is still open, ignoring errors raised because the process
# on the other end of the pipe has already gone away.
def close_recorder_pipe(io)
  io.close unless io.closed?
rescue Errno::EPIPE, IOError
  nil
end

# Starts one recording segment: sox captures audio and writes WAV data to its
# stdout, which is relayed to ffmpeg's stdin; ffmpeg writes the segment file
# while running the `silencedetect` filter.
#
# @param file_index [Integer] segment number; selects the output file via filename
# @return [Array] sox stdin, ffmpeg stdout, ffmpeg stderr, the sox wait thread
#   and the ffmpeg wait thread, in that order
def start_ffmpeg(file_index)
  file_name = filename(file_index)
  sox_cmd = [
    'sox',
    '-t', 'coreaudio',
    '-d',                   # Use default audio input device
    '-c', '2',              # Stereo channels
    '-r', '44100',          # Sample rate
    '-b', '16',             # Bit depth
    '-e', 'signed-integer',
    '-t', 'wav',            # Output as WAV format for piping
    '-'
  ]
  ffmpeg_cmd = [
    'ffmpeg',
    '-f', 'wav',
    '-i', '-',              # Input from sox via pipe
    '-af', "silencedetect=n=#{@silence_threshold}:d=#{@silence_duration}",
    '-c:a', 'pcm_s16le',
    '-y',                   # Overwrite output files
    file_name
  ]

  sox_stdin, sox_stdout, sox_stderr, sox_wait_thr = Open3.popen3(*sox_cmd)
  ffmpeg_stdin, ffmpeg_stdout, ffmpeg_stderr, ffmpeg_wait_thr = Open3.popen3(*ffmpeg_cmd)

  # WAV data is binary: relay raw bytes instead of text lines, which would be
  # split on newline bytes and could have a newline appended by `puts`.
  sox_stdout.binmode
  ffmpeg_stdin.binmode

  Thread.new do
    begin
      IO.copy_stream(sox_stdout, ffmpeg_stdin)
    rescue Errno::EPIPE, IOError
      # ffmpeg exited, or a pipe was closed, while the segment was being stopped.
      nil
    ensure
      # Closing ffmpeg's stdin signals end of input so it can finish the file.
      close_recorder_pipe(sox_stdout)
      close_recorder_pipe(ffmpeg_stdin)
    end
  end

  # Nothing reads sox's stderr; drain it so sox cannot block on a full pipe.
  Thread.new do
    begin
      IO.copy_stream(sox_stderr, File::NULL)
    rescue IOError
      nil
    ensure
      close_recorder_pipe(sox_stderr)
    end
  end

  [sox_stdin, ffmpeg_stdout, ffmpeg_stderr, sox_wait_thr, ffmpeg_wait_thr]
end
# Transcribes one recorded segment on a background thread: prints the
# transcription to stdout, then deletes the segment file.
#
# @param file_index [Integer] segment number; resolved to a path via filename
# @return [Thread] the worker thread
def transcribe_audio(file_index)
  Thread.new do
    segment_path = filename(file_index)
    api = AudioTranscriberApi.new(access_token: ENV['OPENAI_API_KEY'])
    puts api.transcribe(segment_path)
    File.delete(segment_path)
  end
end
# Sends SIGTERM to a recorder process if it is still running.
def terminate_recorder(wait_thr)
  Process.kill('TERM', wait_thr.pid) if wait_thr.alive?
rescue Errno::ESRCH
  nil # the process exited between the alive? check and the signal
end

# Waits for a signalled recorder process to exit; sends SIGKILL if it is still
# running after +timeout+ seconds.
def await_recorder_exit(wait_thr, timeout: 5)
  return if wait_thr.join(timeout)

  Process.kill('KILL', wait_thr.pid)
  wait_thr.join
rescue Errno::ESRCH
  nil
end

# Ends the current segment, starts the next one, and hands the finished
# segment to transcription once its recorder processes have exited.
def rotate_recording
  finished_index = @file_index
  old_pipes = [@stdin, @stdout, @stderr]
  old_recorders = [@sox_wait_thr, @ffmpeg_wait_thr]

  old_recorders.each { |wait_thr| terminate_recorder(wait_thr) }

  # Start the next segment before waiting so the recording gap stays short.
  @file_index += 1
  @stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
  monitor_for_silence

  # Transcribe only after the old processes are gone, so the segment file is
  # no longer being written when it is read.
  old_recorders.each { |wait_thr| await_recorder_exit(wait_thr) }
  old_pipes.each { |io| io.close unless io.closed? }
  transcribe_audio(finished_index)
end

# Monitor for silence detection in stderr.
#
# Watches the current @stderr stream on a background thread. The first line
# that reports a `silence_start` triggers a rotation to the next segment; if
# the stream ends without one, the thread simply finishes.
#
# NOTE(review): lines containing "silence_start: 0" are skipped. That text
# also prefixes values such as "silence_start: 0.25", so silences starting in
# the first second are ignored too -- presumably to skip silence at the very
# start of a segment; confirm the intent.
#
# @return [Thread] the monitoring thread
def monitor_for_silence
  watched_stderr = @stderr
  Thread.new do
    silence_seen = watched_stderr.each_line.any? do |line|
      line.include?('silence_start') && !line.include?('silence_start: 0')
    end
    rotate_recording if silence_seen
  end
end
# Begin the first recording segment and start watching it for silence.
@file_index = 1
recorder_handles = start_ffmpeg(@file_index)
@stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = recorder_handles
monitor_for_silence
# Block the main thread indefinitely; monitoring runs on a background thread.
sleep