ZJIT: Pre-size all profile entries for number of operands
tekknolagi committed Jul 11, 2025
commit c00566de23a1aeed7a74c7675d961b1371511d54
7 changes: 3 additions & 4 deletions zjit/src/gc.rs
@@ -16,8 +16,8 @@ pub struct IseqPayload {
 }
 
 impl IseqPayload {
-    fn new(iseq_size: u32) -> Self {
-        Self { profile: IseqProfile::new(iseq_size), start_ptr: None }
+    fn new(iseq: IseqPtr) -> Self {
+        Self { profile: IseqProfile::new(iseq), start_ptr: None }
     }
 }

@@ -32,8 +32,7 @@ pub fn get_or_create_iseq_payload(iseq: IseqPtr) -> &'static mut IseqPayload {
             // We drop the payload with Box::from_raw when the GC frees the iseq and calls us.
             // NOTE(alan): Sometimes we read from an iseq without ever writing to it.
             // We allocate in those cases anyways.
-            let iseq_size = get_iseq_encoded_size(iseq);
-            let new_payload = IseqPayload::new(iseq_size);
+            let new_payload = IseqPayload::new(iseq);
             let new_payload = Box::into_raw(Box::new(new_payload));
             rb_iseq_set_zjit_payload(iseq, new_payload as VoidPtr);
81 changes: 47 additions & 34 deletions zjit/src/profile.rs
@@ -41,44 +41,15 @@ impl Profiler {
 pub extern "C" fn rb_zjit_profile_insn(opcode: ruby_vminsn_type, ec: EcPtr) {
     with_vm_lock(src_loc!(), || {
         let mut profiler = Profiler::new(ec);
-        profile_insn(&mut profiler, opcode);
+        profile_operands(&mut profiler);
     });
 }

-/// Profile a YARV instruction
-fn profile_insn(profiler: &mut Profiler, opcode: ruby_vminsn_type) {
-    match opcode {
-        YARVINSN_opt_nil_p => profile_operands(profiler, 1),
-        YARVINSN_opt_plus => profile_operands(profiler, 2),
-        YARVINSN_opt_minus => profile_operands(profiler, 2),
-        YARVINSN_opt_mult => profile_operands(profiler, 2),
-        YARVINSN_opt_div => profile_operands(profiler, 2),
-        YARVINSN_opt_mod => profile_operands(profiler, 2),
-        YARVINSN_opt_eq => profile_operands(profiler, 2),
-        YARVINSN_opt_neq => profile_operands(profiler, 2),
-        YARVINSN_opt_lt => profile_operands(profiler, 2),
-        YARVINSN_opt_le => profile_operands(profiler, 2),
-        YARVINSN_opt_gt => profile_operands(profiler, 2),
-        YARVINSN_opt_ge => profile_operands(profiler, 2),
-        YARVINSN_opt_and => profile_operands(profiler, 2),
-        YARVINSN_opt_or => profile_operands(profiler, 2),
-        YARVINSN_opt_send_without_block => {
-            let cd: *const rb_call_data = profiler.insn_opnd(0).as_ptr();
-            let argc = unsafe { vm_ci_argc((*cd).ci) };
-            // Profile all the arguments and self (+1).
-            profile_operands(profiler, (argc + 1) as usize);
-        }
-        _ => {}
-    }
-}

 /// Profile the Type of top-`n` stack operands
-fn profile_operands(profiler: &mut Profiler, n: usize) {
+fn profile_operands(profiler: &mut Profiler) {
     let profile = &mut get_or_create_iseq_payload(profiler.iseq).profile;
     let types = &mut profile.opnd_types[profiler.insn_idx];
-    if types.len() <= n {
-        types.resize(n, Empty);
-    }
+    let n = types.len();
     for i in 0..n {
         let opnd_type = Type::from_value(profiler.peek_at_stack((n - i - 1) as isize));
         types[i] = types[i].union(opnd_type);
@@ -91,9 +62,51 @@ pub struct IseqProfile {
     opnd_types: Vec<Vec<Type>>,
 }
 
+/// Get YARV instruction argument
+fn get_arg(pc: *const VALUE, arg_idx: isize) -> VALUE {
+    unsafe { *(pc.offset(arg_idx + 1)) }
+}
+
 impl IseqProfile {
-    pub fn new(iseq_size: u32) -> Self {
-        Self { opnd_types: vec![vec![]; iseq_size as usize] }
+    pub fn new(iseq: IseqPtr) -> Self {
+        // Pre-size all the operand slots in the opnd_types table so profiling is as fast as possible
Member (@k0kubun):

I didn't quite understand how the pre-sizing is helping here. You're still giving vec![] as the initial value to each vector and then resizing each.

If we resize the vector when we insert to the index (only) for the first time, it's probably not slower, is it? If we do so, we can skip resizing the vector for unused instructions, so not doing the pre-sizing could be faster and save memory, IIUC.

Contributor Author (tekknolagi):

This pre-sizing happens once per ISEQ per VM

The previous thing checked once per profile per opcode per ISEQ on the hot path if the vec was the right size

It was an additional 100-200ms savings, I think

Member (@k0kubun), Jul 10, 2025:

I get that the current version is faster, but I'm only suggesting the same speedup could be achieved without pre-sizing.

It seems like the slowness in the previous version came (not from "checking" it but) from the fact that the vector was optional and we ended up using clone() every time to upsert Some(). But now that it's a non-optional vector and you can always get &mut of it, you should be able to check if the vector length is 0 and resize it on the first profile, without paying an allocation overhead on future profiles for the same opcode. Checking the length of a vector should be a negligible overhead compared to cloning (or resizing) a vector.

So I didn't get why it was changed to pre-sizing, but since it's already an improvement, I'm fine with merging this as is for now.
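
For concreteness, the lazy variant being suggested would look something like the sketch below, reusing profile_operands and the other names from this diff and assuming the old n parameter is kept (an illustration of the suggestion, not code from this PR):

/// Profile the Type of the top-`n` stack operands, sizing the slot vector
/// lazily: the resize runs at most once per instruction, and every later
/// call pays only the cheap is_empty() check.
fn profile_operands(profiler: &mut Profiler, n: usize) {
    let profile = &mut get_or_create_iseq_payload(profiler.iseq).profile;
    let types = &mut profile.opnd_types[profiler.insn_idx];
    if types.is_empty() {
        // First profile of this instruction: allocate its operand slots once.
        types.resize(n, Empty);
    }
    for i in 0..n {
        let opnd_type = Type::from_value(profiler.peek_at_stack((n - i - 1) as isize));
        types[i] = types[i].union(opnd_type);
    }
}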

Contributor Author (tekknolagi):

I'll remove the second commit for now

Member (@k0kubun):

Hmm, reverting it seems to slow it down by 5% or so. I guess doing multiple allocations at once helps? Since it does give a speedup in benchmarks, I'm actually fine with not reverting it.

Running benchmark "getivar" (1/1)
+ setarch x86_64 -R taskset -c 10 /opt/rubies/before/bin/ruby --zjit -I harness /home/k0kubun/src/github.com/Shopify/yjit-bench/benchmarks/getivar.rb
ruby 3.5.0dev (2025-07-11T15:51:28Z mb-benchmark-compile c00566de23) +ZJIT +PRISM [x86_64-linux]
itr:   time
 #1:  651ms
RSS: 14.7MiB
MAXRSS: 15.9MiB
Running benchmark "getivar" (1/1)
+ setarch x86_64 -R taskset -c 10 /opt/rubies/after/bin/ruby --zjit -I harness /home/k0kubun/src/github.com/Shopify/yjit-bench/benchmarks/getivar.rb
ruby 3.5.0dev (2025-07-11T15:51:28Z mb-benchmark-compile 993d5aa2a4) +ZJIT +PRISM [x86_64-linux]
itr:   time
 #1:  687ms
RSS: 14.6MiB
MAXRSS: 15.9MiB
Total time spent benchmarking: 1s

before: ruby 3.5.0dev (2025-07-11T15:51:28Z mb-benchmark-compile c00566de23) +ZJIT +PRISM [x86_64-linux]
after: ruby 3.5.0dev (2025-07-11T15:51:28Z mb-benchmark-compile 993d5aa2a4) +ZJIT +PRISM [x86_64-linux]

-------  -----------  ----------  ----------  ----------  -------------  ------------
bench    before (ms)  stddev (%)  after (ms)  stddev (%)  after 1st itr  before/after
getivar  651.6        0.0         687.2       0.0         0.948          0.948
-------  -----------  ----------  ----------  ----------  -------------  ------------
Legend:
- after 1st itr: ratio of before/after time for the first benchmarking iteration.
- before/after: ratio of before/after time. Higher is better for after. Above 1 represents a speedup.

Contributor Author (tekknolagi):

I'll probably instead change profiling to not use Type but instead do a ClassDistribution-type thing of (VALUE class, shape_id_t shape) tuples. Is there a clean way of representing:

  • Ruby class
  • Fixnum & other immediates
  • Shape

in one small data structure?
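
For illustration, one hypothetical way to pack those cases into a single small entry; ProfiledType and its variants are invented for this sketch, and only VALUE and shape_id_t are real CRuby types:

// Hypothetical compact per-operand profile entry; not existing ZJIT code.
enum ProfiledType {
    // Nothing observed at this operand yet.
    Empty,
    // Immediates encode their type in the VALUE tag bits, so no class pointer is needed.
    Fixnum,
    Flonum,
    StaticSymbol,
    NilOrBool,
    // Heap object: remember the class, plus the shape for ivar-related opcodes.
    Object { class: VALUE, shape: shape_id_t },
    // More than one distinct case was observed.
    Polymorphic,
}

Such an entry stays a couple of machine words wide, at the cost of collapsing to Polymorphic on the first mismatch; whether the shape field earns its space is exactly the question raised in the reply below.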

Member (@k0kubun):

There's no such thing in CRuby itself. YJIT built its own variable-length encoding of profiling data in Context::encode()/Context::decode(). We probably need something different from YJIT's, so I guess we'll build one for ZJIT too.

Btw shapes are effectively used only for ivars today, so it's probably inefficient to use a Tuple that contains a shape for every instruction.

+        let iseq_size = unsafe { get_iseq_encoded_size(iseq) };
+        let mut opnd_types = vec![vec![]; iseq_size as usize];
+        let mut insn_idx = 0;
+        while insn_idx < iseq_size {
+            // Get the current pc and opcode
+            let pc = unsafe { rb_iseq_pc_at_idx(iseq, insn_idx) };
+
+            // try_into() call below is unfortunate. Maybe pick i32 instead of usize for opcodes.
+            let opcode: ruby_vminsn_type = unsafe { rb_iseq_opcode_at_pc(iseq, pc) }
+                .try_into()
+                .unwrap();
+            let n = match opcode {
+                YARVINSN_zjit_opt_nil_p => 1,
+                YARVINSN_zjit_opt_plus => 2,
+                YARVINSN_zjit_opt_minus => 2,
+                YARVINSN_zjit_opt_mult => 2,
+                YARVINSN_zjit_opt_div => 2,
+                YARVINSN_zjit_opt_mod => 2,
+                YARVINSN_zjit_opt_eq => 2,
+                YARVINSN_zjit_opt_neq => 2,
+                YARVINSN_zjit_opt_lt => 2,
+                YARVINSN_zjit_opt_le => 2,
+                YARVINSN_zjit_opt_gt => 2,
+                YARVINSN_zjit_opt_ge => 2,
+                YARVINSN_zjit_opt_and => 2,
+                YARVINSN_zjit_opt_or => 2,
+                YARVINSN_zjit_opt_send_without_block => {
+                    let cd: *const rb_call_data = get_arg(pc, 0).as_ptr();
+                    let argc = unsafe { vm_ci_argc((*cd).ci) };
+                    argc + 1
+                }
+                _ => 0, // Don't profile
+            };
+            opnd_types[insn_idx as usize].resize(n as usize, Empty);
+            insn_idx += insn_len(opcode as usize);
+        }
+        Self { opnd_types }
+    }

     /// Get profiled operand types for a given instruction index