/* Python/perf_jit_trampoline.c - external/github.com/python/cpython - Git at Google */

 /*
  * Python Perf Trampoline Support - JIT Dump Implementation
  *
  * This file implements the perf jitdump API for Python's performance profiling
  * integration. It allows perf (Linux performance analysis tool) to understand
  * and profile dynamically generated Python bytecode by creating JIT dump files
  * that perf can inject into its analysis.
  *
  *
  * IMPORTANT: This file exports specific callback functions that are part of
  * Python's internal API. Do not modify the function signatures or behavior
  * of exported functions without coordinating with the Python core team.
  *
  * Usually the binary and libraries are mapped in separate region like below:
  *
  *   address ->
  *    --+---------------------+--//--+---------------------+--
  *      | .text | .data | ... |      | .text | .data | ... |
  *    --+---------------------+--//--+---------------------+--
  *          myprog                      libc.so
  *
  * So it'd be easy and straight-forward to find a mapped binary or library from an
  * address.
  *
  * For JIT code, however, the code arena only cares about the code section,
  * while the resulting DSOs (which are generated by perf inject -j) also contain
  * ELF headers and unwind info. perf would then generate the following address
  * space with synthesized MMAP events. Let's say it has a sample between
  * addresses B and C.
  *
  *                                                sample
  *                                                  |
  *   address ->                         A       B   v   C
  *   ---------------------------------------------------------------------------------------------------
  *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
  *   /tmp/jitted-PID-1.so           | (headers) | .text | unwind info |
  *   /tmp/jitted-PID-2.so                   | (headers) | .text | unwind info |
  *     ...
  *   ---------------------------------------------------------------------------------------------------
  *
  * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
  * the unwind info. If it maps both .text section and unwind sections, the sample
  * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
  * which one is right. So to make perf happy we have non-overlapping ranges for each
  * DSO:
  *
  *   address ->
  *   -------------------------------------------------------------------------------------------------------
  *   /tmp/jitted-PID-0.so   | (headers) | .text | unwind info |
  *   /tmp/jitted-PID-1.so                         | (headers) | .text | unwind info |
  *   /tmp/jitted-PID-2.so                                               | (headers) | .text | unwind info |
  *     ...
  *   -------------------------------------------------------------------------------------------------------
  *
  * As the trampolines are constant, we add a constant padding, but in general the padding needs to have the
  * size of the unwind info rounded up to 16 bytes. For our trampolines this is typically 0x50.
  */


 #include "Python.h"
 #include "pycore_ceval.h"         // _PyPerf_Callbacks
 #include "pycore_frame.h"
 #include "pycore_interp.h"
 #include "pycore_mmap.h"          // _PyAnnotateMemoryMap()
 #include "pycore_jit_unwind.h"
 #include "pycore_runtime.h"       // _PyRuntime

 #ifdef PY_HAVE_PERF_TRAMPOLINE

 /* Standard library includes for perf jitdump implementation */
 #if defined(__linux__)
 #  include <elf.h>                // ELF architecture constants
 #endif
 #include <fcntl.h>                // File control operations
 #include <stdio.h>                // Standard I/O operations
 #include <stdlib.h>               // Standard library functions
 #include <string.h>               // memcpy, strlen
 #include <sys/mman.h>             // Memory mapping functions (mmap)
 #include <sys/types.h>            // System data types
 #include <unistd.h>               // System calls (sysconf, getpid)
 #include <sys/time.h>             // Time functions (gettimeofday)
 #if defined(__linux__)
 #  include <sys/syscall.h>        // System call interface
 #endif

 // =============================================================================
 //                           CONSTANTS AND CONFIGURATION
 // =============================================================================

 /*
  * Memory layout considerations for perf jitdump:
  *
  * Perf expects non-overlapping memory regions for each JIT-compiled function.
  * When perf processes the jitdump file, it creates synthetic DSO (Dynamic
  * Shared Object) files that contain:
  * - ELF headers
  * - .text section (actual machine code)
  * - Unwind information (for stack traces)
  *
  * To ensure proper address space layout, we add padding between code regions.
  * This prevents address conflicts when perf maps the synthesized DSOs.
  *
  * Memory layout example:
  * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding]
  * /tmp/jitted-PID-1.so:                                       [headers][.text][unwind_info][padding]
  *
  * The padding size is now calculated automatically during initialization
  * based on the actual unwind information requirements.
  */


 /* These constants are defined in <elf.h>, which we can't use outside of Linux. */
 #if !defined(__linux__)
 #  if defined(__i386__) || defined(_M_IX86)
 #    define EM_386      3
 #  elif defined(__arm__) || defined(_M_ARM)
 #    define EM_ARM      40
 #  elif defined(__x86_64__) || defined(_M_X64)
 #    define EM_X86_64   62
 #  elif defined(__aarch64__)
 #    define EM_AARCH64  183
 #  elif defined(__riscv)
 #    define EM_RISCV    243
 #  endif
 #endif

 /* Shorthand for the runtime-wide perf trampoline API state */
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api

 /* Type aliases used by the jitdump writer */
 typedef uint64_t uword;                    // Always 64 bits; holds code addresses and sizes
 typedef const char* CodeComments;          // Code comment strings (NOTE(review): no use visible in this file)

 /* Memory size constants */
 #define MB (1024 * 1024)                   // Bytes in one mebibyte; used to size the stdio buffer

 // =============================================================================
 //                        ARCHITECTURE-SPECIFIC DEFINITIONS
 // =============================================================================

 /*
  * Report the ELF machine (EM_*) constant matching the architecture this
  * file is being compiled for.
  *
  * The value is stored in the jitdump header so that perf knows which
  * architecture the recorded machine code belongs to. The selection is made
  * entirely at compile time; architectures without a matching EM_* constant
  * fall through to Py_UNREACHABLE().
  */
 static uint64_t GetElfMachineArchitecture(void) {
     uint64_t elf_machine;

 #if defined(__x86_64__) || defined(_M_X64)
     elf_machine = EM_X86_64;
 #elif defined(__i386__) || defined(_M_IX86)
     elf_machine = EM_386;
 #elif defined(__aarch64__)
     elf_machine = EM_AARCH64;
 #elif defined(__arm__) || defined(_M_ARM)
     elf_machine = EM_ARM;
 #elif defined(__riscv)
     elf_machine = EM_RISCV;
 #else
     Py_UNREACHABLE();  // No EM_* constant is known for this architecture
     elf_machine = 0;
 #endif

     return elf_machine;
 }

 // =============================================================================
 //                           PERF JITDUMP DATA STRUCTURES
 // =============================================================================

 /*
  * Perf jitdump file format structures
  *
  * These structures define the binary format that perf expects for JIT dump files.
  * The format is documented in the Linux perf tools source code and must match
  * exactly for proper perf integration.
  */

 /*
  * Jitdump file header - written once at the beginning of the jitdump file
  * by perf_map_jit_write_header().
  *
  * The struct is written to the file as raw bytes, so the order and width of
  * the fields are part of the on-disk format and must not be changed.
  */
 typedef struct {
     uint32_t magic;              // Magic number (0x4A695444 = "JiTD")
     uint32_t version;            // Jitdump format version (this writer emits 1)
     uint32_t size;               // Size of this header structure in bytes
     uint32_t elf_mach_target;    // Target architecture (from GetElfMachineArchitecture)
     uint32_t reserved;           // Reserved field (written as 0)
     uint32_t process_id;         // Process ID of the process writing the jitdump
     uint64_t time_stamp;         // Wall-clock creation time in microseconds since the epoch
     uint64_t flags;              // Feature flags (written as 0)
 } Header;

 /*
  * Record types of the jitdump format.
  *
  * The value is stored in BaseEvent.event at the start of every record.
  * This file only emits PerfUnwindingInfo and PerfLoad records; the other
  * values are listed for completeness.
  */
 enum PerfEvent {
     PerfLoad = 0,           // Code load event (new JIT function)
     PerfMove = 1,           // Code move event (function relocated)
     PerfDebugInfo = 2,      // Debug information event
     PerfClose = 3,          // JIT session close event
     PerfUnwindingInfo = 4   // Stack unwinding information event
 };

 /*
  * Base event structure - common prefix of every record in the jitdump file.
  *
  * It is embedded as the first member of CodeLoadEvent and
  * CodeUnwindingInfoEvent.
  */
 struct BaseEvent {
     uint32_t event;         // Record type (a PerfEvent value)
     uint32_t size;          // Total record size in bytes, including this prefix and the payload
     uint64_t time_stamp;    // Monotonic timestamp in nanoseconds (get_current_monotonic_ticks)
 };

 /*
  * Code load event - announces a new code region to perf.
  *
  * The fixed-size part below is followed in the file by variable-length data
  * (see the trailing comment inside the struct). The struct is written as raw
  * bytes, so its layout is part of the on-disk format.
  */
 typedef struct {
     struct BaseEvent base;   // Common event header (event == PerfLoad)
     uint32_t process_id;     // Process ID where code was generated
 #if defined(__APPLE__)
     uint64_t thread_id;      // Thread ID (64-bit here; filled by pthread_threadid_np)
 #else
     uint32_t thread_id;      // Thread ID (filled from the gettid system call)
 #endif
     uint64_t vma;            // Virtual memory address where code is loaded
     uint64_t code_address;   // Address of the machine code (this writer sets it equal to vma)
     uint64_t code_size;      // Size of the machine code in bytes
     uint64_t code_id;        // Unique identifier for this code region
     /* Followed by:
      * - null-terminated function name string
      * - code_size bytes of machine code; the writer patches the tail of the
      *   emitted copy with a per-entry marker so each entry's bytes are unique
      */
 } CodeLoadEvent;

 /*
  * Code unwinding information event - carries the DWARF unwind data that
  * perf needs to unwind through a code region.
  *
  * The writer emits this record immediately before the matching
  * CodeLoadEvent.
  */
 typedef struct {
     struct BaseEvent base;      // Common event header (event == PerfUnwindingInfo)
     uint64_t unwind_data_size;  // sizeof(EhFrameHeader) + size of the .eh_frame data
     uint64_t eh_frame_hdr_size; // Size of the EH frame header (sizeof(EhFrameHeader))
     uint64_t mapped_size;       // unwind_data_size rounded up to a multiple of 16
     /* Followed by, in the order the writer emits them:
      * - DWARF .eh_frame unwinding data
      * - EH frame header (EhFrameHeader)
      * - Zero padding so the whole record is a multiple of 8 bytes
      */
 } CodeUnwindingInfoEvent;

 /*
  * EH Frame Header structure for DWARF unwinding
  *
  * This header provides metadata about the .eh_frame data. In the jitdump
  * record the header is written directly AFTER the .eh_frame bytes, which is
  * why the offsets stored in it are negative. It uses PC-relative and
  * data-relative encodings to keep the synthesized DSO self-contained when
  * perf injects it.
  *
  * The struct is packed and written as raw bytes; the static assertion below
  * pins its size to 20 bytes.
  */
 typedef struct __attribute__((packed)) {
     uint8_t version;            // Header version (written as 1)
     uint8_t eh_frame_ptr_enc;   // Encoding of eh_frame_ptr (sdata4 | pcrel)
     uint8_t fde_count_enc;      // Encoding of eh_fde_count (udata4)
     uint8_t table_enc;          // Encoding of the table entry below (sdata4 | datarel)
     int32_t eh_frame_ptr;       // Offset from this field back to the start of the .eh_frame data
     uint32_t eh_fde_count;      // Number of table entries (written as 1)
     int32_t from;               // Table entry, first half - presumably the code start; TODO confirm
     int32_t to;                 // Table entry, second half - presumably the FDE location; TODO confirm
 } EhFrameHeader;
 _Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch");

 // =============================================================================
 //                              GLOBAL STATE MANAGEMENT
 // =============================================================================

 /*
  * State of the perf jitdump writer
  *
  * This structure holds everything needed to generate the jitdump file.
  * A single file-scope instance exists, since there is one jitdump file
  * (/tmp/jit-<pid>.dump) per Python process.
  */
 typedef struct {
     FILE* perf_map;          // Stream for the jitdump file; NULL while not initialized
     PyMutex map_lock;        // Serializes initialization, record writing and finalization
     void* mapped_buffer;     // Mapping of the file's first page (signals perf we're active); may be NULL
     size_t mapped_size;      // Size of the mapped region (one page)
     uint32_t code_id;        // Counter for unique code region identifiers
     uint64_t build_id_salt;  // Per-process salt mixed into the marker patched into emitted code bytes
 } PerfMapJitState;

 /*
  * The single writer state for this process. As a static object it starts
  * zero-initialized, so perf_map is NULL until perf_map_jit_init() succeeds.
  */
 static PerfMapJitState perf_jit_map_state;

 // =============================================================================
 //                              TIME UTILITIES
 // =============================================================================

 /* Nanoseconds in one second; scales tv_sec in get_current_monotonic_ticks() */
 static const intptr_t nanoseconds_per_second = 1000000000;

 /*
  * Read the monotonic clock and express it in nanoseconds.
  *
  * Event timestamps use the monotonic clock because it is not affected by
  * adjustments of the system (wall) clock, so the ordering and spacing of
  * events stays consistent.
  *
  * Returns: CLOCK_MONOTONIC time in nanoseconds (seconds * 1e9 + nanoseconds).
  * A failing clock_gettime() is treated as unreachable.
  */
 static int64_t get_current_monotonic_ticks(void) {
     struct timespec now;
     const int status = clock_gettime(CLOCK_MONOTONIC, &now);

     if (status != 0) {
         Py_UNREACHABLE();  // Not expected to fail on supported systems
         return 0;
     }

     /* Widen the seconds first so the multiplication is done in 64 bits */
     const int64_t whole_seconds = now.tv_sec;
     return whole_seconds * nanoseconds_per_second + now.tv_nsec;
 }

 /*
  * Read the wall clock and express it in microseconds.
  *
  * This value goes into the jitdump file header. Unlike the monotonic
  * clock it is real calendar time and can be correlated with other system
  * events.
  *
  * Returns: microseconds since the Unix epoch, as reported by gettimeofday().
  * A failing gettimeofday() is treated as unreachable.
  */
 static int64_t get_current_time_microseconds(void) {
     struct timeval now;
     const int status = gettimeofday(&now, NULL);

     if (status < 0) {
         Py_UNREACHABLE();  // Not expected to fail on supported systems
         return 0;
     }

     /* Widen the seconds first so the multiplication is done in 64 bits */
     const int64_t whole_seconds = (int64_t)(now.tv_sec);
     return (whole_seconds * 1000000) + now.tv_usec;
 }

 // =============================================================================
 //                              FILE I/O UTILITIES
 // =============================================================================

 /*
  * Write a buffer to the jitdump file, retrying after short writes.
  *
  * fwrite() may store fewer bytes than requested; the loop keeps writing the
  * remainder until everything has been handed to the stream.
  *
  * Writing is best effort. If the stream is not open, or fwrite() makes no
  * progress (for example because the filesystem holding /tmp is full), the
  * function stops and returns; the jitdump file is then incomplete, but the
  * interpreter keeps running. An I/O failure is a condition that can really
  * happen, so it must not be marked with Py_UNREACHABLE().
  *
  * Args:
  *   buffer: Pointer to data to write
  *   size: Number of bytes to write
  */
 static void perf_map_jit_write_fully(const void* buffer, size_t size) {
     FILE* out_file = perf_jit_map_state.perf_map;
     const char* ptr = (const char*)(buffer);

     if (out_file == NULL) {
         return;  // Jitdump file is not open (never initialized or already closed)
     }

     while (size > 0) {
         const size_t written = fwrite(ptr, 1, size, out_file);
         if (written == 0) {
             break;  // Write error: give up on the rest of this buffer
         }
         size -= written;
         ptr += written;
     }
 }

 /*
  * Write the jitdump file header
  *
  * The header must be written exactly once at the beginning of each jitdump
  * file. It provides metadata that perf uses to parse the rest of the file.
  *
  * Args:
  *   pid: Process ID to include in the header
  *   out_file: File handle to write to (currently unused, uses global state)
  */
 static void perf_map_jit_write_header(int pid, FILE* out_file) {
     Header header;

     /* Initialize header with required values */
     header.magic = 0x4A695444;                    // "JiTD" magic number
     header.version = 1;                           // Current jitdump version
     header.size = sizeof(Header);                 // Header size for validation
     header.elf_mach_target = GetElfMachineArchitecture();  // Target architecture
     header.reserved = 0;                          // padding reserved for future use
     header.process_id = pid;                      // Process identifier
     header.time_stamp = get_current_time_microseconds();   // Creation time
     header.flags = 0;                             // No special flags currently used

     perf_map_jit_write_fully(&header, sizeof(header));
 }

 // =============================================================================
 //                              JITDUMP INITIALIZATION
 // =============================================================================

 /*
  * Initialize the perf jitdump interface
  *
  * The whole function runs under perf_jit_map_state.map_lock and is
  * idempotent: if the jitdump file is already open it returns immediately.
  * Otherwise it:
  * 1. Creates /tmp/jit-<pid>.dump
  * 2. Maps the first page of that file (skipped on macOS) to signal perf
  *    that we're using the interface
  * 3. Wraps the descriptor in a buffered FILE* and writes the jitdump header
  * 4. Resets the code ID counter and computes the per-process build-id salt
  * 5. Publishes the code padding and alignment in trampoline_api
  *
  * The memory mapping is crucial - perf detects jitdump files by scanning
  * for processes that have mapped files matching the pattern /tmp/jit-*.dump
  *
  * Every failure path releases what was acquired so far (file descriptor and
  * page mapping), so a later call can retry from a clean state.
  *
  * Returns: Pointer to the initialized global state, or NULL on failure
  */
 static void* perf_map_jit_init(void) {
     PyMutex_Lock(&perf_jit_map_state.map_lock);
     if (perf_jit_map_state.perf_map != NULL) {
         /* Already initialized: nothing to do */
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         return &perf_jit_map_state;
     }

     char filename[100];
     int pid = getpid();

     /* Create unique filename based on process ID */
     snprintf(filename, sizeof(filename), "/tmp/jit-%d.dump", pid);

     /* Create/open the jitdump file with appropriate permissions */
     const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
     if (fd == -1) {
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         return NULL;  // Failed to create file
     }

     /* Get system page size for memory mapping */
     const long page_size = sysconf(_SC_PAGESIZE);
     if (page_size == -1) {
         close(fd);
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         return NULL;  // Failed to get page size
     }

 #if defined(__APPLE__)
     // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
     perf_jit_map_state.mapped_buffer = NULL;
 #else
     /*
      * Map the first page of the jitdump file
      *
      * This memory mapping serves as a signal to perf that this process
      * is generating JIT code. Perf scans /proc/.../maps looking for mapped
      * files that match the jitdump naming pattern.
      *
      * The mapping must be PROT_READ | PROT_EXEC to be detected by perf.
      */
     perf_jit_map_state.mapped_buffer = mmap(
         NULL,                    // Let kernel choose address
         page_size,               // Map one page
         PROT_READ | PROT_EXEC,   // Read and execute permissions (required by perf)
         MAP_PRIVATE,             // Private mapping
         fd,                      // File descriptor
         0                        // Offset 0 (first page)
     );

     if (perf_jit_map_state.mapped_buffer == MAP_FAILED) {
         perf_jit_map_state.mapped_buffer = NULL;
         close(fd);
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         return NULL;  // Memory mapping failed
     }
     (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size,
                                "cpython:perf_jit_trampoline");
 #endif

     perf_jit_map_state.mapped_size = page_size;

     /* Convert file descriptor to FILE* for easier I/O operations */
     perf_jit_map_state.perf_map = fdopen(fd, "w+");
     if (perf_jit_map_state.perf_map == NULL) {
         /*
          * Undo the page mapping made above. Without this, a retry would
          * overwrite mapped_buffer with a new mapping and leak this one.
          */
         if (perf_jit_map_state.mapped_buffer != NULL) {
             munmap(perf_jit_map_state.mapped_buffer, page_size);
             perf_jit_map_state.mapped_buffer = NULL;
         }
         perf_jit_map_state.mapped_size = 0;
         close(fd);
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         return NULL;  // Failed to create FILE*
     }

     /*
      * Set up file buffering for better performance
      *
      * We use a large buffer (2MB) because jitdump files can be written
      * frequently during program execution. Buffering reduces system call
      * overhead and improves overall performance.
      */
     setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);

     /* Write the jitdump file header */
     perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);

     /* Initialize code ID counter and the per-process build-id salt */
     perf_jit_map_state.code_id = 0;
     perf_jit_map_state.build_id_salt =
         ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks();

     /* Calculate padding size based on actual unwind info requirements */
     size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0);
     size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
     trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16);
     trampoline_api.code_alignment = 32;

     PyMutex_Unlock(&perf_jit_map_state.map_lock);
     return &perf_jit_map_state;
 }

 // =============================================================================
 //                              MAIN JITDUMP ENTRY WRITING
 // =============================================================================

 /*
  * Write a complete jitdump entry for a code region with a provided name.
  *
  * This shares the same implementation as the trampoline callback, but
  * allows callers that don't have a PyCodeObject to reuse the jitdump
  * infrastructure.
  *
  * One logical entry consists of two records, written back to back under
  * map_lock:
  * 1. A PerfUnwindingInfo record: event header, .eh_frame data,
  *    EhFrameHeader, zero padding up to an 8-byte boundary
  * 2. A PerfLoad record: event header, "py::<entry>:<filename>" name with
  *    its null terminator, and a copy of the machine code whose tail is
  *    patched with a per-entry marker
  *
  * The function is best effort: if initialization, name formatting, memory
  * allocation or unwind-info generation fails, or if the jitdump file has
  * been closed in the meantime, it returns without writing anything.
  *
  * Args:
  *   state: Not used; the global state is used instead
  *   code_addr: Address where the code resides
  *   code_size: Size of the code in bytes
  *   entry: Function name, or NULL (treated as "")
  *   filename: File name, or NULL (treated as "")
  */
 static void perf_map_jit_write_entry_with_name(
     void *state,
     const void *code_addr,
     size_t code_size,
     const char *entry,
     const char *filename
 )
 {
     /* Initialize jitdump system on first use */
     void* ret = perf_map_jit_init();
     if (ret == NULL) {
         return;  // Initialization failed, silently abort
     }

     if (entry == NULL) {
         entry = "";
     }
     if (filename == NULL) {
         filename = "";
     }

     /*
      * Create formatted function name for perf display
      *
      * Format: "py::<function_name>:<filename>"
      * The "py::" prefix helps identify Python functions in mixed-language
      * profiles (e.g., when profiling C extensions alongside Python code).
      */
     int formatted_length = snprintf(NULL, 0, "py::%s:%s", entry, filename);
     if (formatted_length < 0) {
         return;  // Formatting failed; no usable length to allocate
     }
     size_t perf_map_entry_size = (size_t)formatted_length + 1;
     char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
     if (perf_map_entry == NULL) {
         return;  // Memory allocation failed
     }
     snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);

     const size_t name_length = strlen(perf_map_entry);
     uword base = (uword)code_addr;
     uword size = code_size;

     /*
      * Generate DWARF unwinding information
      *
      * DWARF data is essential for proper stack unwinding during profiling.
      * Without it, perf cannot generate accurate call graphs, especially
      * in optimized code where frame pointers may be omitted.
      */
     uint8_t buffer[1024];  // Buffer for DWARF data (1KB should be sufficient)
     size_t eh_frame_size = _PyJitUnwind_BuildEhFrame(
         buffer, sizeof(buffer), code_addr, code_size, 0);
     if (eh_frame_size == 0) {
         PyMem_RawFree(perf_map_entry);
         return;
     }

     /*
      * A logical jitdump entry is written as multiple records and also consumes
      * a process-global code_id. Serialize the whole sequence so concurrent JIT
      * compilation cannot interleave records or reuse an ID.
      */
     PyMutex_Lock(&perf_jit_map_state.map_lock);

     /*
      * The lock was released between perf_map_jit_init() and this point, so
      * perf_map_jit_fini() may have closed the file in between. Re-check
      * under the lock instead of writing to a NULL stream.
      */
     if (perf_jit_map_state.perf_map == NULL) {
         PyMutex_Unlock(&perf_jit_map_state.map_lock);
         PyMem_RawFree(perf_map_entry);
         return;
     }

     /*
      * Write Code Unwinding Information Event
      *
      * This event must be written before the code load event to ensure
      * perf has the unwinding information available when it processes
      * the code region.
      */
     CodeUnwindingInfoEvent ev2;
     ev2.base.event = PerfUnwindingInfo;
     ev2.base.time_stamp = get_current_monotonic_ticks();
     ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;

     /* Verify we don't exceed our padding budget */
     assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding);

     ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
     ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16);  // 16-byte alignment

     /* Calculate total event size with padding */
     int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size);
     int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size;  // 8-byte align
     ev2.base.size = (uint32_t)(content_size + padding_size);

     /* Write the unwinding info event header */
     perf_map_jit_write_fully(&ev2, sizeof(ev2));

     /*
      * Build EH Frame Header
      *
      * The EH frame header provides metadata about the DWARF unwinding
      * information. It is written AFTER the .eh_frame bytes, so the offsets
      * stored in it point backwards and are negative.
      */
     EhFrameHeader f;
     f.version = 1;
     f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel;
     f.fde_count_enc = DWRF_EH_PE_udata4;
     f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel;

     /*
      * Calculate relative offsets for EH frame navigation. eh_frame_ptr is
      * relative to its own position, which is 4 bytes into the header.
      */
     f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char));
     f.eh_fde_count = 1;  // We generate exactly one FDE per function
     f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size);
     /* The first 4 bytes of the .eh_frame data hold the CIE payload length */
     uint32_t cie_payload_size;
     memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size));
     int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size);
     f.to = -(int32_t)(eh_frame_size - cie_size);

     /* Write EH frame data first, then the header that describes it */
     perf_map_jit_write_fully(buffer, eh_frame_size);
     perf_map_jit_write_fully(&f, sizeof(f));

     /* Write padding to maintain alignment (padding_size is 0..7) */
     char padding_bytes[] = "\0\0\0\0\0\0\0\0";
     perf_map_jit_write_fully(&padding_bytes, padding_size);

     /*
      * Write Code Load Event
      *
      * This event tells perf about the new code region. It includes:
      * - Memory addresses and sizes
      * - Process and thread identification
      * - Function name for symbol resolution
      * - The actual machine code bytes
      */
     CodeLoadEvent ev;
     ev.base.event = PerfLoad;
     ev.base.size = sizeof(ev) + (name_length+1) + size;
     ev.base.time_stamp = get_current_monotonic_ticks();
     ev.process_id = getpid();
 #if defined(__APPLE__)
     pthread_threadid_np(NULL, &ev.thread_id);
 #else
     ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
 #endif
     ev.vma = base;                       // Virtual memory address
     ev.code_address = base;              // Same as VMA for our use case
     ev.code_size = size;

     /* Assign unique code ID and increment counter */
     perf_jit_map_state.code_id += 1;
     ev.code_id = perf_jit_map_state.code_id;

     /* Write code load event and associated data */
     perf_map_jit_write_fully(&ev, sizeof(ev));
     perf_map_jit_write_fully(perf_map_entry, name_length+1);  // Include null terminator
     /*
      * Ensure each synthetic DSO has unique .text bytes.
      *
      * perf merges DSOs that share a build-id. Since trampolines can share
      * identical code and unwind bytes, perf may resolve all JIT frames to
      * the first symbol it saw (including entries from previous runs when
      * build-id caching is enabled). Patch a small marker in the emitted
      * bytes to make the build-id depend on a per-process salt and code id
      * without modifying the live code.
      */
     uint64_t marker = perf_jit_map_state.build_id_salt ^
         ((uint64_t)perf_jit_map_state.code_id << 32) ^
         (uint64_t)code_size;
     if (size >= sizeof(marker)) {
         /* Emit the code unchanged except for its last 8 bytes */
         size_t prefix = size - sizeof(marker);
         perf_map_jit_write_fully((void *)(base), prefix);
         perf_map_jit_write_fully(&marker, sizeof(marker));
     }
     else if (size > 0) {
         /* Code shorter than the marker: XOR marker bytes into a copy */
         uint8_t tmp[sizeof(marker)];
         memcpy(tmp, (void *)(base), size);
         for (size_t i = 0; i < size; i++) {
             tmp[i] ^= (uint8_t)(marker >> (i * 8));
         }
         perf_map_jit_write_fully(tmp, size);
     }

     /* Release the lock, then free the formatted name */
     PyMutex_Unlock(&perf_jit_map_state.map_lock);
     PyMem_RawFree(perf_map_entry);
 }

 /*
  * Trampoline callback: record a code region that belongs to a Python
  * code object.
  *
  * The qualified name and the file name are read from the code object as
  * UTF-8 and passed to perf_map_jit_write_entry_with_name(), which writes
  * the unwinding-info record and the code-load record. A NULL code object,
  * or a NULL co_qualname / co_filename, yields an empty string for the
  * corresponding name. A NULL result from PyUnicode_AsUTF8() is passed on
  * as-is; the callee treats NULL names as "".
  *
  * Args:
  *   state: Jitdump state, forwarded unchanged to the writer
  *   code_addr: Address where the compiled code resides
  *   code_size: Size of the compiled code in bytes
  *   co: Python code object containing metadata (may be NULL)
  *
  * IMPORTANT: This function signature is part of Python's internal API
  * and must not be changed without coordinating with core Python development.
  */
 static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                      size_t code_size, PyCodeObject *co)
 {
     const char *qualname_utf8 = "";
     const char *filename_utf8 = "";

     if (co != NULL && co->co_qualname != NULL) {
         qualname_utf8 = PyUnicode_AsUTF8(co->co_qualname);
     }
     if (co != NULL && co->co_filename != NULL) {
         filename_utf8 = PyUnicode_AsUTF8(co->co_filename);
     }

     perf_map_jit_write_entry_with_name(
         state, code_addr, code_size, qualname_utf8, filename_utf8);
 }

 /*
  * Record a code region in the jitdump file under an explicit name.
  *
  * Non-static entry point for callers that have no PyCodeObject. It forwards
  * to perf_map_jit_write_entry_with_name() with a NULL state pointer.
  *
  * Args:
  *   code_addr: Address where the code resides
  *   code_size: Size of the code in bytes
  *   entry: Function name to display
  *   filename: File name to display
  */
 void
 _PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size,
                           const char *entry, const char *filename)
 {
     void *no_state = NULL;

     perf_map_jit_write_entry_with_name(no_state, code_addr, code_size,
                                        entry, filename);
 }

 // =============================================================================
 //                              CLEANUP AND FINALIZATION
 // =============================================================================

 /*
  * Finalize and cleanup the perf jitdump system
  *
  * Closes the jitdump file (which flushes buffered data), unmaps the
  * first-page mapping, and clears trampoline_api.state. Calling it when
  * nothing was initialized is harmless.
  *
  * Both the file handle and the mapping are released while holding
  * map_lock, because perf_map_jit_init() sets these fields under the
  * same lock; touching them outside the lock would race with a concurrent
  * (re)initialization.
  *
  * Args:
  *   state: Jitdump state (currently unused, uses global state)
  *
  * Returns: 0 (always)
  *
  * IMPORTANT: This function signature is part of Python's internal API
  * and must not be changed without coordinating with core Python development.
  */
 static int perf_map_jit_fini(void* state) {
     PyMutex_Lock(&perf_jit_map_state.map_lock);

     /*
      * Close the jitdump file. Holding the lock guarantees that no other
      * thread is in the middle of writing a record.
      */
     if (perf_jit_map_state.perf_map != NULL) {
         fclose(perf_jit_map_state.perf_map);  // This also flushes buffers
         perf_jit_map_state.perf_map = NULL;
     }

     /*
      * Unmap the memory region
      *
      * This removes the signal to perf that we were generating JIT code.
      */
     if (perf_jit_map_state.mapped_buffer != NULL) {
         munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
         perf_jit_map_state.mapped_buffer = NULL;
     }
     perf_jit_map_state.mapped_size = 0;

     PyMutex_Unlock(&perf_jit_map_state.map_lock);

     /* Clear global state reference */
     trampoline_api.state = NULL;

     return 0;  // Success
 }

 // =============================================================================
 //                              PUBLIC API EXPORT
 // =============================================================================

 /*
  * Callback table for the jitdump backend
  *
  * Python's trampoline system drives this backend through the three
  * function pointers below, given in the order: initialization, entry
  * writing, finalization.
  *
  * CRITICAL: This structure and its contents are part of Python's internal
  * API. The function signatures and behavior must remain stable to maintain
  * compatibility with the Python interpreter's perf integration system.
  *
  * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h
  */
 _PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
     perf_map_jit_init,         // Opens the jitdump file and returns the state
     perf_map_jit_write_entry,  // Records one code region for a code object
     perf_map_jit_fini,         // Closes the file and releases the mapping
 };

 #endif /* PY_HAVE_PERF_TRAMPOLINE */