Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Merge upstream/main into gh-138122-fix
Resolved merge conflicts in:
- Include/cpython/pystate.h: Keep both base_frame and last_profiled_frame fields
- Include/internal/pycore_debug_offsets.h: Add debug offsets for both fields
- InternalDocs/frames.md: Include documentation for both features
- Modules/_remote_debugging/_remote_debugging.h: Updated process_frame_chain signature
- Modules/_remote_debugging/frames.c: Merged base_frame validation with caching logic
- Modules/_remote_debugging/threads.c: Integrated caching support with base_frame

The base_frame feature (for stack validation) now coexists with the
last_profiled_frame feature (for profiling cache optimization).
  • Loading branch information
pablogsal committed Dec 6, 2025
commit 9986d885afd123068c606de3eaccfbe7b7f34519
2 changes: 2 additions & 0 deletions Include/cpython/pystate.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ struct _ts {
is defined in internal headers that cannot be exposed in the public API. */
struct _PyInterpreterFrame *base_frame;

struct _PyInterpreterFrame *last_profiled_frame;

Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
PyObject *c_profileobj;
Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_debug_offsets.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ typedef struct _Py_DebugOffsets {
uint64_t interp;
uint64_t current_frame;
uint64_t base_frame;
uint64_t last_profiled_frame;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
Expand Down Expand Up @@ -274,6 +275,7 @@ typedef struct _Py_DebugOffsets {
.interp = offsetof(PyThreadState, interp), \
.current_frame = offsetof(PyThreadState, current_frame), \
.base_frame = offsetof(PyThreadState, base_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
Expand Down
20 changes: 20 additions & 0 deletions InternalDocs/frames.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,26 @@ last frame address matches `base_frame`. If not, discard the sample as incomplet
since the frame chain may have been in an inconsistent state due to concurrent updates.


### Remote Profiling Frame Cache

The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever a frame returns (in `_PyEval_FrameClearAndPop`).

This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the amount of remote
memory reads needed when call stacks are deep and stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
prevents transient frames (called and returned between profiler samples) from
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.


### The Instruction Pointer

`_PyInterpreterFrame` has two fields which are used to maintain the instruction
Expand Down
7 changes: 6 additions & 1 deletion Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,12 @@ extern int process_frame_chain(
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
int *stopped_at_cached_frame,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs
);

/* Frame cache functions */
Expand Down
18 changes: 13 additions & 5 deletions Modules/_remote_debugging/frames.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,17 @@ process_frame_chain(
StackChunkList *chunks,
PyObject *frame_info,
uintptr_t base_frame_addr,
uintptr_t gc_frame)
uintptr_t gc_frame,
uintptr_t last_profiled_frame,
int *stopped_at_cached_frame,
uintptr_t *frame_addrs, // optional: C array to receive frame addresses
Py_ssize_t *num_addrs, // in/out: current count / updated count
Py_ssize_t max_addrs) // max capacity of frame_addrs array
{
uintptr_t frame_addr = initial_frame_addr;
uintptr_t prev_frame_addr = 0;
uintptr_t last_frame_addr = 0; // Track last frame visited for validation
const size_t MAX_FRAMES = 1024;
const size_t MAX_FRAMES = 1024 + 512;
size_t frame_count = 0;

// Initialize output flag
Expand Down Expand Up @@ -374,7 +379,10 @@ process_frame_chain(
}

// Validate we reached the base frame (sentinel at bottom of stack)
if (last_frame_addr != base_frame_addr) {
// Only validate if we walked the full chain (didn't stop at cached frame)
// and base_frame_addr is provided (non-zero)
int stopped_early = stopped_at_cached_frame && *stopped_at_cached_frame;
if (!stopped_early && base_frame_addr != 0 && last_frame_addr != base_frame_addr) {
PyErr_Format(PyExc_RuntimeError,
"Incomplete sample: did not reach base frame (expected 0x%lx, got 0x%lx)",
base_frame_addr, last_frame_addr);
Expand Down Expand Up @@ -544,7 +552,7 @@ collect_frames_with_cache(
Py_ssize_t frames_before = PyList_GET_SIZE(frame_info);

int stopped_at_cached = 0;
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, gc_frame,
if (process_frame_chain(unwinder, frame_addr, chunks, frame_info, 0, gc_frame,
last_profiled_frame, &stopped_at_cached,
addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
return -1;
Expand All @@ -566,7 +574,7 @@ collect_frames_with_cache(
// Cache miss - continue walking from last_profiled_frame to get the rest
STATS_INC(unwinder, frame_cache_misses);
Py_ssize_t frames_before_walk = PyList_GET_SIZE(frame_info);
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, gc_frame,
if (process_frame_chain(unwinder, last_profiled_frame, chunks, frame_info, 0, gc_frame,
0, NULL, addrs, &num_addrs, FRAME_CACHE_MAX_FRAMES) < 0) {
return -1;
}
Expand Down
25 changes: 22 additions & 3 deletions Modules/_remote_debugging/threads.c
Original file line number Diff line number Diff line change
Expand Up @@ -396,9 +396,28 @@ unwind_stack_for_thread(
}
}

if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info, base_frame_addr, gc_frame) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
if (unwinder->cache_frames) {
// Use cache to avoid re-reading unchanged parent frames
uintptr_t last_profiled_frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
if (collect_frames_with_cache(unwinder, frame_addr, &chunks, frame_info,
gc_frame, last_profiled_frame, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
// Update last_profiled_frame for next sample
uintptr_t lpf_addr = *current_tstate + unwinder->debug_offsets.thread_state.last_profiled_frame;
if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
sizeof(uintptr_t), &frame_addr) < 0) {
PyErr_Clear(); // Non-fatal
}
} else {
// No caching - process entire frame chain with base_frame validation
if (process_frame_chain(unwinder, frame_addr, &chunks, frame_info,
base_frame_addr, gc_frame, 0, NULL, NULL, NULL, 0) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to process frame chain");
goto error;
}
}

*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
Expand Down
Loading
You are viewing a condensed version of this merge commit. You can view the full changes here.