Skip to content

Speed up Func#call by optimizing arg buffer initialization #169

@ianks

Description

@ianks

I was experimenting when making Func#call faster, and noticed there is a decent performance win by avoiding rb_ary_push. It requires a bit of unsafe code, but I believe the implemntation is sound:

diff --git i/bench/host_call.rb w/bench/host_call.rb
index ca4d3fc..b75fe40 100644
--- i/bench/host_call.rb
+++ w/bench/host_call.rb
@@ -1,14 +1,18 @@
 require_relative "bench"
 
-# Call host func (4 args)    337.181k (± 3.7%) i/s -      1.692M in   5.024584s
-# Call host func (16 args)   296.615k (± 5.2%) i/s -      1.498M in   5.064241s
-# Call host func (64 args)   217.487k (± 3.2%) i/s -      1.106M in   5.090547s
-# Call host func (128 args)  119.689k (± 3.8%) i/s -    605.136k in   5.063428s
+# New:
+# Call host func (4 results):     348836.6 i/s
+# Call host func (16 results):    320159.3 i/s - 1.09x  slower
+# Call host func (64 results):    224814.2 i/s - 1.55x  slower
+# Call host func (128 results):   126469.2 i/s - 2.76x  slower
+# Call host func (256 results):   50566.7 i/s  - 6.90x  slower
 
-# Call host func (4 args):   333800.6 i/s
-# Call host func (16 args):  291889.7 i/s - 1.14x  slower
-# Call host func (64 args):  185375.6 i/s - 1.80x  slower
-# Call host func (128 args): 97043.2 i/s - 3.44x  slower
+# Old:
+# Call host func (4 args):      331524.8 i/s
+# Call host func (16 args):     280723.9 i/s - 1.18x  slower
+# Call host func (64 args):     182873.7 i/s - 1.81x  slower
+# Call host func (128 args):    96891.1 i/s  - 3.42x  slower
+# Call host func (256 args):    40983.9 i/s  - 8.09x  slower
 
 Bench.ips do |x|
   engine = Wasmtime::Engine.new
diff --git i/ext/src/ruby_api/func.rs w/ext/src/ruby_api/func.rs
index 1ef4ba8..e42a494 100644
--- i/ext/src/ruby_api/func.rs
+++ w/ext/src/ruby_api/func.rs
@@ -1,3 +1,5 @@
+use std::mem::transmute;
+
 use super::{
     convert::{ToRubyValue, ToSym, ToValTypeVec, ToWasmVal},
     errors::result_error,
@@ -178,16 +180,57 @@ impl<'a> Func<'a> {
             [] => Ok(QNIL.into()),
             [result] => result.to_ruby_value(store),
             _ => {
-                let array = RArray::with_capacity(results.len());
-                for result in results {
-                    array.push(result.to_ruby_value(store)?)?;
+                // We want to initialized a sized ruby array so we can write to
+                // it without using `rb_ary_push`, which is slow. So we just
+                // pass in the results as a slice for the initial values. They
+                // will be overridden by the loop below. But be careful not to
+                // let Ruby try to use the values in the array before we've
+                // written to them.
+                //
+                // For result sizes of 128, this is about 20% faster than using
+                // rb_ary_push.
+                //
+                // # Safety
+                //
+                // Safety is guaranteed by the assertions below:
+                // - `rb_ary_new_from_values` will allocated `results.len()`
+                //   elements of usize (not wasmtime::Val)
+                // - Ruby never access the values in the array before we've
+                //   written to them, since doing so would point to the Rust vec
+                //   until we write to it
+                let fake_values = unsafe { transmute(results.as_slice()) };
+                let array = RArray::from_slice::<Value>(fake_values);
+                let array_slice = unsafe { rarray_as_mut_slice(array, results.len()) };
+                let results_iter = results.iter().enumerate();
+
+                assert!(array_slice.len() == results_iter.len()); // optimize out bounds check
+
+                for (i, result) in results.iter().enumerate() {
+                    array_slice[i] = result.to_ruby_value(store).map_err(|e| {
+                        // If we fail along the way, zero out the array just to be safe
+                        let _ = array.clear();
+                        e
+                    })?;
                 }
+
                 Ok(array.into())
             }
         }
     }
 }
 
+/// Converts an `RArray` into a mutable slice.
+///
+/// # Safety
+/// The capacity of the array must be known in advance for this to be safe, since we provide mutable access to the array's contents.
+unsafe fn rarray_as_mut_slice<'a>(array: RArray, capacity: usize) -> &'a mut [Value] {
+    let array_slice = unsafe { array.as_slice() };
+    let ptr = array_slice.as_ptr();
+    let array_slice = unsafe { std::slice::from_raw_parts_mut(ptr as *mut Value, capacity) };
+
+    array_slice
+}
+
 impl From<&Func<'_>> for wasmtime::Extern {
     fn from(func: &Func) -> Self {
         Self::Func(func.get())

Metadata

Metadata

Assignees

No one assigned

    Labels

    good first issueGood for newcomershelp wantedExtra attention is neededperfPerformance related

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions