Add last_profiled_frame field to thread state for remote profilers
Remote profilers that sample call stacks from external processes need to read the entire frame chain on every sample. For deep stacks, this is expensive since most of the stack is typically unchanged between samples.

This adds a `last_profiled_frame` pointer that remote profilers can use to implement a caching optimization. When sampling, a profiler writes the current frame address here. The eval loop then keeps this pointer valid by updating it to the parent frame in _PyEval_FrameClearAndPop. This creates a "high-water mark" that always points to a frame still on the stack, allowing profilers to skip reading unchanged portions of the stack.

The new write in ceval.c is guarded so there is zero overhead when profiling isn't active (the field starts out NULL and the branch is perfectly predictable).
pablogsal committed Dec 1, 2025
commit b2ca1aca447acf9490e0d28fdda314d78d7a571e
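
(For illustration of the caching pattern described in the commit message: a minimal profiler-side sketch. `read_remote`/`write_remote` are hypothetical process_vm_readv-style helpers and the `off_*` parameters stand in for offsets taken from `_Py_DebugOffsets`; none of this code is part of the change itself.)

/* Sketch only: hypothetical remote-profiler sampling loop, not CPython code. */
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>

/* Hypothetical helpers wrapping process_vm_readv()/writev()-style access. */
extern int read_remote(pid_t pid, uintptr_t addr, void *buf, size_t len);
extern int write_remote(pid_t pid, uintptr_t addr, const void *buf, size_t len);

/* Read only the frames pushed since the previous sample.  Frames at and below
 * last_profiled_frame are still on the stack and unchanged, so they can be
 * served from a local cache.  On the first sample last_profiled is 0 (NULL),
 * and the loop simply walks the whole chain. */
static size_t
collect_new_frames(pid_t pid, uintptr_t tstate_addr,
                   size_t off_current, size_t off_last_profiled,
                   size_t off_frame_previous,
                   uintptr_t *out, size_t max_out)
{
    uintptr_t current = 0, last_profiled = 0;
    read_remote(pid, tstate_addr + off_current, &current, sizeof current);
    read_remote(pid, tstate_addr + off_last_profiled, &last_profiled, sizeof last_profiled);

    size_t n = 0;
    uintptr_t frame = current;
    while (frame != 0 && frame != last_profiled && n < max_out) {
        out[n++] = frame;                       /* not seen in the previous sample */
        read_remote(pid, frame + off_frame_previous, &frame, sizeof frame);
    }

    /* Record the new high-water mark; _PyEval_FrameClearAndPop keeps it
     * pointing at a frame that is still on the stack from here on. */
    write_remote(pid, tstate_addr + off_last_profiled, &current, sizeof current);
    return n;
}

A real profiler would additionally pause the target or validate the reads; that is omitted here.
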
2 changes: 2 additions & 0 deletions Include/cpython/pystate.h
@@ -135,6 +135,8 @@ struct _ts {
/* Pointer to currently executing frame. */
struct _PyInterpreterFrame *current_frame;

struct _PyInterpreterFrame *last_profiled_frame;

Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
PyObject *c_profileobj;
2 changes: 2 additions & 0 deletions Include/internal/pycore_debug_offsets.h
@@ -102,6 +102,7 @@ typedef struct _Py_DebugOffsets {
uint64_t next;
uint64_t interp;
uint64_t current_frame;
uint64_t last_profiled_frame;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
@@ -272,6 +273,7 @@ typedef struct _Py_DebugOffsets {
.next = offsetof(PyThreadState, next), \
.interp = offsetof(PyThreadState, interp), \
.current_frame = offsetof(PyThreadState, current_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
18 changes: 18 additions & 0 deletions InternalDocs/frames.md
@@ -111,6 +111,24 @@ The shim frame points to a special code object containing the `INTERPRETER_EXIT`
instruction which cleans up the shim frame and returns.


### Remote Profiling Frame Cache

The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever a frame returns (in `_PyEval_FrameClearAndPop`).

This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the number of remote
memory reads needed when call stacks are deep and stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL, avoiding any overhead when profiling is inactive.
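
(Purely illustrative, not part of the documented text or of this change: once the new frames above `last_profiled_frame` have been read, a full sample can be rebuilt by concatenating them with the cached suffix from the previous sample. The array layout and names below are hypothetical; `cached_start` would be found by the caller, e.g. with a linear search for the pointer value.)

/* Hypothetical sketch: merge newly read frames with the cached suffix.
 *   new_frames[]  - frames read above last_profiled_frame (innermost first)
 *   cached[]      - the previous sample, with cached_start being the index of
 *                   last_profiled_frame within it */
#include <stddef.h>
#include <stdint.h>

static size_t
merge_sample(const uintptr_t *new_frames, size_t n_new,
             const uintptr_t *cached, size_t n_cached, size_t cached_start,
             uintptr_t *out, size_t max_out)
{
    size_t n = 0;
    /* Frames above the high-water mark: freshly read from the target process. */
    for (size_t i = 0; i < n_new && n < max_out; i++) {
        out[n++] = new_frames[i];
    }
    /* Frames at and below the high-water mark: unchanged, reused from the cache. */
    for (size_t i = cached_start; i < n_cached && n < max_out; i++) {
        out[n++] = cached[i];
    }
    return n;
}
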


### The Instruction Pointer

`_PyInterpreterFrame` has two fields which are used to maintain the instruction
7 changes: 7 additions & 0 deletions Python/ceval.c
@@ -2004,6 +2004,13 @@ clear_gen_frame(PyThreadState *tstate, _PyInterpreterFrame * frame)
void
_PyEval_FrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame * frame)
{
// Update last_profiled_frame for remote profiler frame caching.
// By this point, tstate->current_frame is already set to the parent frame.
// The guarded check avoids writes when profiling is not active (predictable branch).
if (tstate->last_profiled_frame != NULL) {
pablogsal (Member, Author) commented on Dec 1, 2025:

TL;DR: This should be obvious, but in case anyone is worried, I did my homework: the guard check has no measurable cost, and pyperformance shows 1.00x (no change). It's effectively free.


This is the only modification outside the profiler itself, and before anyone wonders whether it has any effect: it has no measurable effect. Here is how I measured it.

The guard check added to _PyEval_FrameClearAndPop compiles down to just two instructions:

278cab: cmpq   $0x0,0x50(%rdi)    ; Compare last_profiled_frame with NULL
278cb0: je     278cba             ; Jump if equal (skip the write)

During normal operation when no profiler is attached, last_profiled_frame is always NULL, which means this branch always takes the exact same path. Modern CPUs predict this perfectly after just a few iterations and never mispredict it again. A perfectly predicted branch executes speculatively with zero pipeline stalls, making it effectively free.

I confirmed this using Linux perf with hardware performance counters. I ran the Python test suite along with some selected tests (test_list, test_tokenize, test_gc, test_dict, test_ast, test_compile) at the maximum sample rate (99,999 Hz), collecting separate data files for CPU cycles, branch mispredictions, and cache misses:

perf record -F 99999 -g -o ./perf_cycles.data -- ./python -m test 
perf record -e branch-misses -F 99999 -g -o ./perf_branch_misses.data -- ./python -m test 

First, I checked how much the entire function contributes to total CPU time:

$ perf report -i ./perf_cycles.data --stdio --sort=symbol | grep FrameClearAndPop
# Samples: 20K of event 'cpu_atom/cycles/P'
     0.10%     0.09%  [.] _PyEval_FrameClearAndPop
# Samples: 422K of event 'cpu_core/cycles/P'
     0.12%     0.10%  [.] _PyEval_FrameClearAndPop

The whole function is only 0.10% of cycles on P-cores and 0.09% on E-cores, so we're already in negligible territory. But the real question is whether the guard check causes any branch mispredictions.

I checked the function's contribution to total branch misses:

$ perf report -i ./perf_branch_misses.data --stdio --sort=symbol | grep FrameClearAndPop
# Samples: 12K of event 'cpu_atom/branch-misses/'
     0.07%     0.06%  [.] _PyEval_FrameClearAndPop
# Samples: 162K of event 'cpu_core/branch-misses/'
     0.11%     0.11%  [.] _PyEval_FrameClearAndPop

The entire function is only 0.11% of total branch misses. But within that, how much does our guard check contribute? I used perf annotate to see exactly which instructions caused branch misses within the function:

$ perf annotate -i ./perf_branch_misses.data _PyEval_FrameClearAndPop --stdio

This command reads the branch misprediction samples and maps them to specific instructions, showing what percentage of the function's branch misses occurred at each location. The result for our guard check:

; The guard check - if (tstate->last_profiled_frame != NULL)
    0.00 :   278cab: cmpq   $0x0,0x50(%rdi)   ; ← 0.00% branch misses
    0.00 :   278cb0: je     278cba            ; ← 0.00% branch misses (PERFECTLY PREDICTED)

Zero. Not a single branch misprediction was sampled at that instruction across hundreds of thousands of samples. The CPU's branch predictor correctly predicts this branch every single time because it always takes the same path.

For comparison, here's more of the annotated output showing other branches in the same function:

    0.00 :   278cb0: je     278cba    ; Guard check - 0.00% misses
   32.25 :   278cbe: jne    278d00    ; frame->owner check - 32.25% misses
   50.13 :   278cc8: call   2c39e0    ; Function call
   21.62 :   278ce5: je     278d60    ; Refcount check - 21.62% misses

The frame ownership check (frame->owner == FRAME_OWNED_BY_THREAD) accounts for 32.25% of the function's branch misses, and the refcount check (--op->ob_refcnt == 0) accounts for 21.62%. These are data-dependent branches that the CPU cannot predict perfectly. Our guard check contributes exactly 0.00% because it is perfectly predictable, unlike these other branches that depend on runtime data.

The overall Python branch miss rate is already very low (0.03% of all branches), and the guard check contributes nothing to this.

Finally, I ran pyperformance comparing main (ea51e745c713) against this PR (8d4a83894398). The geometric mean across all benchmarks is 1.00x, confirming no measurable regression in real-world workloads:

Pyperformance run:

All benchmarks:

| Benchmark | main-ea51e745c713 | PR-8d4a83894398 |
|---|---|---|
| subparsers | 100 ms | 97.8 ms: 1.02x faster |
| async_generators | 646 ms | 664 ms: 1.03x slower |
| bpe_tokeniser | 6.87 sec | 7.06 sec: 1.03x slower |
| comprehensions | 27.3 us | 26.6 us: 1.03x faster |
| coverage | 134 ms | 131 ms: 1.02x faster |
| crypto_pyaes | 121 ms | 124 ms: 1.02x slower |
| deepcopy_reduce | 6.94 us | 6.76 us: 1.03x faster |
| fannkuch | 656 ms | 640 ms: 1.02x faster |
| generators | 45.4 ms | 44.4 ms: 1.02x faster |
| logging_format | 17.4 us | 17.0 us: 1.02x faster |
| logging_simple | 15.2 us | 14.8 us: 1.03x faster |
| mdp | 2.09 sec | 2.13 sec: 1.02x slower |
| nbody | 176 ms | 180 ms: 1.02x slower |
| pickle_pure_python | 647 us | 630 us: 1.03x faster |
| regex_compile | 227 ms | 223 ms: 1.02x faster |
| regex_dna | 255 ms | 262 ms: 1.03x slower |
| regex_effbot | 4.23 ms | 4.34 ms: 1.03x slower |
| scimark_monte_carlo | 108 ms | 110 ms: 1.02x slower |
| scimark_sor | 188 ms | 184 ms: 1.02x faster |
| spectral_norm | 154 ms | 150 ms: 1.03x faster |
| unpack_sequence | 69.2 ns | 71.1 ns: 1.03x slower |
| xdsl_constant_fold | 97.6 ms | 95.4 ms: 1.02x faster |
| xml_etree_generate | 148 ms | 144 ms: 1.03x faster |
| xml_etree_process | 107 ms | 105 ms: 1.02x faster |
| Geometric mean | (ref) | 1.00x faster |

Benchmark hidden because not significant (85): 2to3, many_optionals, async_tree_none, async_tree_cpu_io_mixed, async_tree_cpu_io_mixed_tg, async_tree_eager, async_tree_eager_cpu_io_mixed, async_tree_eager_cpu_io_mixed_tg, async_tree_eager_io, async_tree_eager_io_tg, async_tree_eager_memoization, async_tree_eager_memoization_tg, async_tree_eager_tg, async_tree_io, async_tree_io_tg, async_tree_memoization, async_tree_memoization_tg, async_tree_none_tg, asyncio_tcp, asyncio_tcp_ssl, asyncio_websockets, chameleon, chaos, bench_mp_pool, bench_thread_pool, coroutines, dask, deepcopy, deepcopy_memo, deltablue, django_template, docutils, dulwich_log, float, create_gc_cycles, gc_traversal, genshi_text, genshi_xml, go, hexiom, html5lib, json_dumps, json_loads, logging_silent, mako, meteor_contest, nqueens, pathlib, pickle, pickle_dict, pickle_list, pidigits, pprint_safe_repr, pprint_pformat, pyflate, python_startup, python_startup_no_site, raytrace, regex_v8, richards, richards_super, scimark_fft, scimark_lu, scimark_sparse_mat_mult, sphinx, sqlalchemy_declarative, sqlalchemy_imperative, sqlglot_v2_normalize, sqlglot_v2_optimize, sqlglot_v2_parse, sqlglot_v2_transpile, sqlite_synth, sympy_expand, sympy_integrate, sympy_sum, sympy_str, telco, tomli_loads, tornado_http, typing_runtime_protocols, unpickle, unpickle_list, unpickle_pure_python, xml_etree_parse, xml_etree_iterparse

So the conclusion is that the branch is perfectly predicted, adds no memory traffic beyond reading a value that is already in the L1 cache (tstate is hot), and avoids dirtying a cache line when the profiler is not attached. Zero measurable cost.

tstate->last_profiled_frame = tstate->current_frame;
}

if (frame->owner == FRAME_OWNED_BY_THREAD) {
clear_thread_frame(tstate, frame);
}