davidteren · November 9, 2024 16:25 · Aug 11, 2024
diff --git a/transcribe-whisper-1.rb b/transcribe-whisper-1.rb
@@ -0,0 +1,130 @@
+#!/usr/bin/env ruby
+
+require 'open3'
+require 'signal'
+require 'openai'
+
+# Define the silence threshold and duration for detection.
+# Both values are interpolated into ffmpeg's `silencedetect` filter string
+# (`n=<threshold>:d=<duration>`) inside start_ffmpeg.
+# NOTE(review): the duration is presumably in seconds — confirm against the
+# ffmpeg silencedetect documentation.
+@silence_threshold = '-30dB'
+@silence_duration = 0.5
+
+# Wrapper around OpenAI::Client that transcribes an audio file with the
+# "whisper-1" model, retrying failed requests with exponential backoff.
+class AudioTranscriberApi
+  # Number of retries allowed after the first failed attempt.
+  MAX_RETRIES = 5
+
+  # @param access_token [String] token handed to OpenAI::Client
+  def initialize(access_token:)
+    @access_token = access_token
+    @client = OpenAI::Client.new(
+      access_token: @access_token,
+      request_timeout: 20
+    )
+  end
+
+  # Sends +audio_file+ for transcription and returns the "text" field of the
+  # response.
+  #
+  # Any StandardError is reported on stdout and retried up to MAX_RETRIES
+  # times, sleeping 2**attempt seconds (2, 4, 8, ...) between attempts.
+  #
+  # @param audio_file [String] path of the audio file to upload
+  # @return [Object] value stored under "text" in the API response
+  # @raise [StandardError] the last error once the retries are exhausted
+  def transcribe(audio_file)
+    retries = 0
+
+    begin
+      # Block form guarantees the handle is closed after every attempt;
+      # previously each attempt (including retries) leaked an open file.
+      response = File.open(audio_file, "rb") do |file|
+        @client.audio.transcribe(
+          parameters: {
+            model: "whisper-1",
+            file: file,
+            language: "en"
+          }
+        )
+      end
+      response["text"]
+    rescue StandardError => e
+      puts "Error transcribing audio file: #{e.message}"
+      # Out of retries: re-raise the error currently being handled.
+      raise if retries >= MAX_RETRIES
+
+      retries += 1
+      puts "Retrying audio transcription for #{audio_file} (#{retries} times) after exponential backoff (#{2 ** retries} seconds)"
+      sleep 2 ** retries
+      retry
+    end
+  end
+end
+
+# Builds the WAV file name for a recording segment.
+#
+# The index is converted to a string and left-padded with zeros to at least
+# three characters, e.g. 1 -> "output_001.wav".
+#
+# @param file_index [#to_s] segment number
+# @return [String] file name of the segment
+def filename(file_index)
+  padded_index = file_index.to_s.rjust(3, '0')
+  "output_#{padded_index}.wav"
+end
+
+# Starts one recording segment: a `sox` process captures audio and writes WAV
+# data to its stdout, which is relayed into an `ffmpeg` process that runs the
+# silencedetect filter and writes the segment to filename(file_index).
+#
+# @param file_index [Integer] segment number used to derive the output file
+# @return [Array] sox's stdin, ffmpeg's stdout, ffmpeg's stderr, sox's wait
+#   thread and ffmpeg's wait thread, in that order
+def start_ffmpeg(file_index)
+  file_name = filename(file_index)
+  sox_cmd = [
+    'sox',
+    '-t', 'coreaudio',
+    '-d', # Use default audio input device
+    '-c', '2', # Stereo channels
+    '-r', '44100', # Sample rate
+    '-b', '16', # Bit depth
+    '-e', 'signed-integer',
+    '-t', 'wav', # Output as WAV format for piping
+    '-'
+  ]
+
+  ffmpeg_cmd = [
+    'ffmpeg',
+    '-f', 'wav',
+    '-i', '-', # Input from sox via pipe
+    '-af', "silencedetect=n=#{@silence_threshold}:d=#{@silence_duration}",
+    '-c:a', 'pcm_s16le',
+    '-y', # Overwrite output files
+    file_name
+  ]
+
+  # Start sox and ffmpeg; the relay thread below connects them.
+  sox_stdin, sox_stdout, sox_stderr, sox_wait_thr = Open3.popen3(*sox_cmd)
+  ffmpeg_stdin, ffmpeg_stdout, ffmpeg_stderr, ffmpeg_wait_thr = Open3.popen3(*ffmpeg_cmd)
+
+  # The stream is binary audio, not text.
+  sox_stdout.binmode
+  ffmpeg_stdin.binmode
+
+  # Relay sox output to ffmpeg input as raw bytes. Line-based gets/puts would
+  # wait for a newline byte that binary audio may not contain for a long time
+  # and could append a stray newline to the final chunk.
+  Thread.new do
+    begin
+      IO.copy_stream(sox_stdout, ffmpeg_stdin)
+    rescue Errno::EPIPE, IOError
+      # ffmpeg exited (or a pipe was closed) before sox finished; nothing
+      # more to relay.
+    ensure
+      # Closing ffmpeg's stdin signals end of input so it can finish the file.
+      [sox_stdout, ffmpeg_stdin].each { |io| io.close unless io.closed? }
+    end
+  end
+
+  # Drain sox's stderr so sox cannot block on a full pipe.
+  Thread.new do
+    begin
+      IO.copy_stream(sox_stderr, File::NULL)
+    rescue IOError
+      # Pipe was closed while draining; safe to stop.
+    ensure
+      sox_stderr.close unless sox_stderr.closed?
+    end
+  end
+
+  # Return the stream and process handles
+  return sox_stdin, ffmpeg_stdout, ffmpeg_stderr, sox_wait_thr, ffmpeg_wait_thr
+end
+
+# Transcribes one recorded segment on a background thread.
+#
+# The thread builds an AudioTranscriberApi using the OPENAI_API_KEY
+# environment variable, prints the transcription of the segment's file to
+# stdout, then deletes the file.
+#
+# @param file_index [Integer] segment number; resolved to a path via filename
+# @return [Thread] the thread performing the work
+def transcribe_audio(file_index)
+  Thread.new do
+    recording = filename(file_index)
+    api = AudioTranscriberApi.new(access_token: ENV['OPENAI_API_KEY'])
+    puts api.transcribe(recording)
+    File.delete(recording)
+  end
+end
+
+# Monitor for silence detection in stderr.
+#
+# Runs a background thread that reads the current segment's ffmpeg stderr
+# (@stderr). When a line reports "silence_start", the thread stops the
+# recording processes, hands the finished segment to transcribe_audio, starts
+# the next segment and keeps monitoring it. The thread ends when stderr
+# reaches end of file without a matching line.
+#
+# Lines containing "silence_start: 0" are skipped. That substring also matches
+# values such as 0.5, so silence starting within the first second of a segment
+# is ignored as well.
+#
+# @return [Thread] the monitoring thread
+def monitor_for_silence
+  Thread.new do
+    loop do
+      silence_detected = @stderr.each_line.any? do |line|
+        line.include?("silence_start") && !line.include?("silence_start: 0")
+      end
+      break unless silence_detected
+
+      # Gracefully interrupt the current sox and ffmpeg processes
+      [@sox_wait_thr, @ffmpeg_wait_thr].each do |wait_thr|
+        begin
+          Process.kill("TERM", wait_thr.pid) if wait_thr.alive?
+        rescue Errno::ESRCH
+          # The process exited between the alive? check and the signal.
+        end
+      end
+
+      # Give ffmpeg time to exit so the WAV file is complete before it is
+      # uploaded; the timeout keeps a stuck process from stalling recording.
+      @ffmpeg_wait_thr.join(5)
+
+      # Release the finished segment's pipes instead of waiting for GC.
+      [@stdin, @stdout, @stderr].each { |io| io.close unless io.closed? }
+
+      # Transcribe the finished segment and start a new recording
+      transcribe_audio(@file_index)
+      @file_index += 1
+      @stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
+    end
+  end
+end
+
+# Start the initial recording.
+# Segments are numbered from 1; filename turns the index into the file name.
+@file_index = 1
+# Keep the handles returned by start_ffmpeg in top-level instance variables so
+# monitor_for_silence can read @stderr and signal both processes. Per
+# start_ffmpeg's return order, @stdin is sox's stdin while @stdout and @stderr
+# belong to ffmpeg.
+@stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
+monitor_for_silence
+
+# Keep the main thread alive to handle signals; recording, monitoring and
+# transcription all run on background threads.
+# NOTE(review): no explicit signal handlers are installed in the visible code.
+sleep