Skip to content

Instantly share code, notes, and snippets.

@davidteren
Forked from swombat/transcribe-whisper-1.rb
Created November 9, 2024 16:25
Show Gist options
  • Save davidteren/0ab4a173aa70cd099462876ede85c96a to your computer and use it in GitHub Desktop.
Save davidteren/0ab4a173aa70cd099462876ede85c96a to your computer and use it in GitHub Desktop.

Revisions

  1. @swombat swombat created this gist Aug 11, 2024.
    130 changes: 130 additions & 0 deletions transcribe-whisper-1.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,130 @@
    #!/usr/bin/env ruby

    require 'open3'
    require 'signal'
    require 'openai'

    # Settings for the silencedetect audio filter; both values are interpolated
    # into the ffmpeg "-af" argument built in start_ffmpeg.
    @silence_threshold = "-30dB" # substituted for silencedetect's n= option
    @silence_duration = 0.5 # substituted for silencedetect's d= option

    # Wrapper around the OpenAI client's audio transcription call, using the
    # "whisper-1" model with the language fixed to English.
    class AudioTranscriberApi
      # Number of retries attempted after the first failed request.
      MAX_RETRIES = 5

      # @param access_token [String] API key handed to OpenAI::Client
      def initialize(access_token:)
        @access_token = access_token
        @client = OpenAI::Client.new(
          access_token: @access_token,
          request_timeout: 20
        )
      end

      # Uploads an audio file for transcription, retrying on any StandardError
      # with exponential backoff (2, 4, 8, ... seconds).
      #
      # @param audio_file [String] path of the audio file to upload
      # @return [Object] the "text" entry of the client's response
      # @raise [StandardError] the last error once MAX_RETRIES retries have failed
      def transcribe(audio_file)
        retries = 0

        begin
          # Block form guarantees the handle is closed on success and on every
          # failed attempt; each retry opens a fresh handle.
          File.open(audio_file, "rb") do |file|
            response = @client.audio.transcribe(
              parameters: {
                model: "whisper-1",
                file: file,
                language: "en"
              }
            )
            response["text"]
          end
        rescue StandardError => e
          puts "Error transcribing audio file: #{e.message}"
          raise if retries >= MAX_RETRIES

          retries += 1
          puts "Retrying audio transcription for #{audio_file} (#{retries} times) after exponential backoff (#{2 ** retries} seconds)"
          sleep 2 ** retries
          retry
        end
      end
    end

    # Builds the WAV file name for a recording segment.
    #
    # @param file_index [#to_s] segment number; left-padded with zeros to at
    #   least three characters
    # @return [String] e.g. "output_001.wav" for 1
    def filename(file_index)
      padded_index = file_index.to_s.rjust(3, '0')
      "output_#{padded_index}.wav"
    end

    # Starts one recording segment: a sox process captures audio and writes WAV
    # data to its stdout, which is relayed into an ffmpeg process that runs the
    # silencedetect filter and writes the segment to filename(file_index).
    #
    # @param file_index [Integer] segment number used to derive the output file
    # @return [Array] [sox stdin, ffmpeg stdout, ffmpeg stderr,
    #   sox wait thread, ffmpeg wait thread]
    def start_ffmpeg(file_index)
      file_name = filename(file_index)
      sox_cmd = [
        'sox',
        '-t', 'coreaudio',
        '-d', # Use default audio input device
        '-c', '2', # Stereo channels
        '-r', '44100', # Sample rate
        '-b', '16', # Bit depth
        '-e', 'signed-integer',
        '-t', 'wav', # Output as WAV format for piping
        '-'
      ]

      ffmpeg_cmd = [
        'ffmpeg',
        '-f', 'wav',
        '-i', '-', # Input from sox via pipe
        '-af', "silencedetect=n=#{@silence_threshold}:d=#{@silence_duration}",
        '-c:a', 'pcm_s16le',
        '-y', # Overwrite output files
        file_name
      ]

      sox_stdin, sox_stdout, sox_stderr, sox_wait_thr = Open3.popen3(*sox_cmd)
      stdin, stdout, stderr, ffmpeg_wait_thr = Open3.popen3(*ffmpeg_cmd)

      # The stream is raw audio, so relay bytes rather than text lines.
      sox_stdout.binmode
      stdin.binmode

      # Relay sox's output to ffmpeg's input until sox reaches EOF.
      Thread.new do
        IO.copy_stream(sox_stdout, stdin)
      rescue Errno::EPIPE, IOError
        # ffmpeg exited (or a pipe was closed) while audio was still flowing;
        # there is nothing left to forward.
      ensure
        # Closing ffmpeg's stdin signals EOF so it can finish the file; closing
        # both ends releases the descriptors for this segment.
        [sox_stdout, stdin].each { |io| io.close unless io.closed? }
      end

      # Drain sox's stderr so sox can never block on a full, unread pipe.
      Thread.new do
        IO.copy_stream(sox_stderr, File::NULL)
      rescue IOError
        # Stream was closed underneath us; nothing to drain.
      ensure
        sox_stderr.close unless sox_stderr.closed?
      end

      [sox_stdin, stdout, stderr, sox_wait_thr, ffmpeg_wait_thr]
    end

    # Transcribes a finished segment on a background thread, prints the result
    # and then deletes the segment's file.
    #
    # @param file_index [Integer] segment number passed to filename
    # @return [Thread] the thread performing the transcription
    def transcribe_audio(file_index)
      Thread.new do
        recording = filename(file_index)
        api = AudioTranscriberApi.new(access_token: ENV['OPENAI_API_KEY'])
        puts api.transcribe(recording)
        # Only reached when transcription succeeded; a failed file is left on disk.
        File.delete(recording)
      end
    end

    # Watches the current segment's ffmpeg stderr for a "silence_start" line and,
    # when one is seen, ends the segment, hands it to transcribe_audio and starts
    # the next segment with a fresh monitor.
    #
    # Lines containing "silence_start: 0" are ignored, i.e. any silence whose
    # reported start begins with the digit 0 (presumably to skip the quiet
    # lead-in of a new segment - confirm against intended behaviour).
    #
    # @return [Thread] the monitoring thread for the current segment
    def monitor_for_silence
      # Capture this segment's handles now: the instance variables are
      # reassigned below when the next segment starts.
      segment_pipes = [@stdin, @stdout, @stderr]

      Thread.new do
        segment_pipes.last.each do |line|
          next unless line.include?("silence_start") && !line.include?("silence_start: 0")

          # Gracefully interrupt the current sox and ffmpeg processes.
          [@sox_wait_thr, @ffmpeg_wait_thr].each do |wait_thr|
            Process.kill("TERM", wait_thr.pid) if wait_thr.alive?
          rescue Errno::ESRCH
            # The process exited between the alive? check and the signal.
          end

          # Give ffmpeg a bounded amount of time to exit so the output file is
          # complete before it is read for transcription.
          ffmpeg_exit_timeout = 5
          @ffmpeg_wait_thr.join(ffmpeg_exit_timeout)

          # Transcribe the finished segment and start a new recording.
          transcribe_audio(@file_index)
          @file_index += 1
          @stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)

          monitor_for_silence
          break
        end

        # Release the finished segment's pipe handles so they do not pile up
        # over a long recording session.
        segment_pipes.each { |io| io.close unless io.closed? }
      end
    end

    # Start the initial recording segment and begin watching it for silence.
    # Per start_ffmpeg's return order, @stdin holds sox's stdin while @stdout and
    # @stderr hold ffmpeg's stdout and stderr.
    @file_index = 1
    @stdin, @stdout, @stderr, @sox_wait_thr, @ffmpeg_wait_thr = start_ffmpeg(@file_index)
    monitor_for_silence

    # Block the main thread indefinitely so the background recording,
    # monitoring and transcription threads keep running. No signal handlers
    # are installed here.
    sleep