AmazonS3に並列アップロード

http://d.hatena.ne.jp/shokai/20100220/1266646461 をgearmanで並列化した

upload-worker.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'gearman'
require 'aws/s3'

# Gearman function name under which this worker registers its upload ability.
GEAR_TASK = "upload-awss3"
# Destination S3 bucket (placeholder value; set it to a real bucket name).
BUCKET = 'your-bucket-name'

# Optional first argument: total number of worker processes to run.
# The current process is itself a worker, so only (count - 1) children are forked.
unless ARGV.empty?
  extra_workers = ARGV.first.to_i - 1
  extra_workers.times do
    child_pid = fork
    # fork returns nil inside the child; the child must not spawn further workers.
    break unless child_pid
    puts "fork pid:#{child_pid}"
  end
end

# Connect to S3. This runs after the fork loop, so every worker process
# establishes its own connection. The credentials are placeholders.
s3_credentials = {
  :access_key_id     => "your-key",
  :secret_access_key => "your-secret"
}
AWS::S3::Base.establish_connection!(s3_credentials)

#Gearman::Util.debug = true
w = Gearman::Worker.new(['localhost:7003'])

# Register the upload ability.
#
# query - job payload of the form "prefix,path": the S3 key prefix and the
#         local path of the file to upload.
#
# Returns the public URL of the uploaded object, or nil when S3 rejected the
# upload. NOTE(review): gearman-ruby documents that a nil/false block result
# reports the job as failed to the server - confirm against the gem version
# in use.
w.add_ability(GEAR_TASK) do |query, _job|
  # Split on the first comma only, so a comma inside the file path survives.
  prefix, name = query.split(',', 2)
  filename = File.basename(name)
  url = "http://#{BUCKET}.s3.amazonaws.com/#{prefix}/#{filename}"
  begin
    print "upload #{name} ... "
    # File.open with a block closes the handle even if the upload raises.
    # It also avoids Kernel#open, which would run a shell command for a
    # job-supplied name starting with "|".
    File.open(name, 'rb') do |file|
      AWS::S3::S3Object.store("/#{prefix}/#{filename}", file, BUCKET, :access => :public_read)
    end
  rescue AWS::S3::ResponseError => error
    puts error
    nil # do not report a URL for an object that was never stored
  else
    puts "success => #{url}"
    url
  end
end

# Process jobs forever; each w.work call is repeated as soon as it returns.
loop { w.work }

jobはカンマ区切りで"prefix,filename"で渡す。
workerを複数起動させておいてからclientでtask登録する。

ruby upload-worker.rb 5

親プロセスと合わせて5つ起動した(forkされる子プロセスは4つ)

fork pid:89264
fork pid:89265
fork pid:89266
fork pid:89267


upload-client.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'gearman'

# Gearman function name to submit jobs to; the worker script registers the same name.
GEAR_TASK = "upload-awss3"

# Connect to the job server and create the task set that collects the upload jobs.
job_servers = ['localhost:7003']
c = Gearman::Client.new(job_servers)
taskset = Gearman::TaskSet.new(c)

# Submit one upload job per file named on the command line, wait for the
# results, and report the elapsed time.
now = Time.now
# Every file of this run shares one prefix: the start time in epoch seconds.
prefix = now.to_i
completed = 0

# Order files by the leading integer of their basename (e.g. 2.jpg before 10.jpg).
ARGV.sort_by { |path| File.basename(path).to_i }.each do |name|
  query = "#{prefix},#{name}"
  puts "add task #{query}"
  task = Gearman::Task.new(GEAR_TASK, query)
  task.on_complete do |res|
    completed += 1
    puts "#{name} => #{res}"
  end
  taskset.add_task(task)
end
# Wait budget scales with the number of files (5 per file; presumably seconds).
taskset.wait(ARGV.size * 5)

puts "finished #{ARGV.size} files #{Time.now - now} (sec)"
# The summary above is printed even if the wait ended before every job
# completed, so make any shortfall visible.
missing = ARGV.size - completed
warn "warning: #{missing} of #{ARGV.size} files did not complete" if missing > 0

EC2インスタンスからS3へアップロードする場合、worker 5つで約5倍、10個で約7倍速になった。