| /* |
| * Python Perf Trampoline Support - JIT Dump Implementation |
| * |
| * This file implements the perf jitdump API for Python's performance profiling |
| * integration. It allows perf (Linux performance analysis tool) to understand |
| * and profile dynamically generated Python bytecode by creating JIT dump files |
| * that perf can inject into its analysis. |
| * |
| * |
| * IMPORTANT: This file exports specific callback functions that are part of |
| * Python's internal API. Do not modify the function signatures or behavior |
| * of exported functions without coordinating with the Python core team. |
| * |
| * Usually the binary and libraries are mapped in separate region like below: |
| * |
 *   address ->
 *    --+---------------------+--//--+---------------------+--
 *      | .text | .data | ... |      | .text | .data | ... |
 *    --+---------------------+--//--+---------------------+--
 *          myprog                      libc.so
| * |
| * So it'd be easy and straight-forward to find a mapped binary or library from an |
| * address. |
| * |
 * For JIT code, however, the code arena only cares about the code section, while
 * the resulting DSOs (which are generated by perf inject -j) also contain ELF
 * headers and unwind info. Perf then generates the following address space with
 * synthesized MMAP events. Let's say it has a sample between address B and C.
| * |
 *                                              sample
 *                                                 |
 * address ->                          A       B   v   C
 * ---------------------------------------------------------------------------
 * /tmp/jitted-PID-0.so    | (headers) | .text | unwind info |
 * /tmp/jitted-PID-1.so            | (headers) | .text | unwind info |
 * /tmp/jitted-PID-2.so                    | (headers) | .text | unwind info |
 * ...
 * ---------------------------------------------------------------------------
| * |
| * If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see |
| * the unwind info. If it maps both .text section and unwind sections, the sample |
| * could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing |
| * which one is right. So to make perf happy we have non-overlapping ranges for each |
| * DSO: |
| * |
 * address ->
 * -------------------------------------------------------------------------------------------------------------------------------
 * /tmp/jitted-PID-0.so    | (headers) | .text | unwind info |
 * /tmp/jitted-PID-1.so                                      | (headers) | .text | unwind info |
 * /tmp/jitted-PID-2.so                                                                        | (headers) | .text | unwind info |
 * ...
 * -------------------------------------------------------------------------------------------------------------------------------
| * |
 * Because every trampoline has the same code, the padding is the same for all of them. In general
 * the padding must be the size of the unwind info rounded up to 16 bytes; for our trampolines this is 0x50.
| */ |
| |
| |
| |
| #include "Python.h" |
| #include "pycore_ceval.h" // _PyPerf_Callbacks |
| #include "pycore_frame.h" |
| #include "pycore_interp.h" |
| #include "pycore_runtime.h" // _PyRuntime |
| |
| #ifdef PY_HAVE_PERF_TRAMPOLINE |
| |
| /* Standard library includes for perf jitdump implementation */ |
| #if defined(__linux__) |
| # include <elf.h> // ELF architecture constants |
| #endif |
| #include <fcntl.h> // File control operations |
| #include <stdio.h> // Standard I/O operations |
| #include <stdlib.h> // Standard library functions |
| #include <sys/mman.h> // Memory mapping functions (mmap) |
| #include <sys/types.h> // System data types |
| #include <unistd.h> // System calls (sysconf, getpid) |
| #include <sys/time.h> // Time functions (gettimeofday) |
| #if defined(__linux__) |
| # include <sys/syscall.h> // System call interface |
| #endif |
| |
| // ============================================================================= |
| // CONSTANTS AND CONFIGURATION |
| // ============================================================================= |
| |
| /* |
| * Memory layout considerations for perf jitdump: |
| * |
| * Perf expects non-overlapping memory regions for each JIT-compiled function. |
| * When perf processes the jitdump file, it creates synthetic DSO (Dynamic |
| * Shared Object) files that contain: |
| * - ELF headers |
| * - .text section (actual machine code) |
| * - Unwind information (for stack traces) |
| * |
| * To ensure proper address space layout, we add padding between code regions. |
| * This prevents address conflicts when perf maps the synthesized DSOs. |
| * |
| * Memory layout example: |
| * /tmp/jitted-PID-0.so: [headers][.text][unwind_info][padding] |
| * /tmp/jitted-PID-1.so: [headers][.text][unwind_info][padding] |
| * |
| * The padding size is now calculated automatically during initialization |
| * based on the actual unwind information requirements. |
| */ |
| |
| |
/*
 * ELF machine identifiers for platforms without <elf.h>.
 *
 * On Linux these come from <elf.h>. Elsewhere only the constant matching the
 * architecture being compiled is provided, which is the one that
 * GetElfMachineArchitecture() returns for that architecture.
 */
#if !defined(__linux__)
#  if defined(__i386__) || defined(_M_IX86)
#    define EM_386 3
#  elif defined(__arm__) || defined(_M_ARM)
#    define EM_ARM 40
#  elif defined(__x86_64__) || defined(_M_X64)
#    define EM_X86_64 62
#  elif defined(__aarch64__)
#    define EM_AARCH64 183
#  elif defined(__riscv)
#    define EM_RISCV 243
#  endif
#endif
| |
/* Shorthand for the trampoline API state stored in the global runtime. */
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api

/* Local type aliases. */
typedef uint64_t uword;            /* 64-bit unsigned integer */
typedef const char *CodeComments;  /* Code comment strings */

/* Number of bytes in one mebibyte, used for buffer sizing. */
#define MB (1024 * 1024)
| |
| // ============================================================================= |
| // ARCHITECTURE-SPECIFIC DEFINITIONS |
| // ============================================================================= |
| |
/*
 * Report the ELF e_machine value (EM_*) for the architecture this file is
 * being compiled for. The value is stored in the jitdump header so that perf
 * knows which architecture the dumped code targets.
 *
 * Returns: one of EM_X86_64, EM_386, EM_AARCH64, EM_ARM or EM_RISCV. On any
 *          other architecture Py_UNREACHABLE() is invoked.
 */
static uint64_t GetElfMachineArchitecture(void) {
    uint64_t machine;

#if defined(__x86_64__) || defined(_M_X64)
    machine = EM_X86_64;
#elif defined(__i386__) || defined(_M_IX86)
    machine = EM_386;
#elif defined(__aarch64__)
    machine = EM_AARCH64;
#elif defined(__arm__) || defined(_M_ARM)
    machine = EM_ARM;
#elif defined(__riscv)
    machine = EM_RISCV;
#else
    Py_UNREACHABLE();  // Unsupported architecture - should never reach here
    machine = 0;
#endif

    return machine;
}
| |
| // ============================================================================= |
| // PERF JITDUMP DATA STRUCTURES |
| // ============================================================================= |
| |
| /* |
| * Perf jitdump file format structures |
| * |
| * These structures define the binary format that perf expects for JIT dump files. |
| * The format is documented in the Linux perf tools source code and must match |
| * exactly for proper perf integration. |
| */ |
| |
/*
 * Jitdump file header.
 *
 * Written once at the very start of the jitdump file; it carries the process
 * metadata and the format version. Field order and widths are part of the
 * on-disk format and must not change.
 */
typedef struct {
    /* Magic number (0x4A695444 = "JiTD") */
    uint32_t magic;
    /* Jitdump format version (currently 1) */
    uint32_t version;
    /* Size of this header structure */
    uint32_t size;
    /* Target architecture (from GetElfMachineArchitecture) */
    uint32_t elf_mach_target;
    /* Reserved field (must be 0) */
    uint32_t reserved;
    /* Process ID of the JIT compiler */
    uint32_t process_id;
    /* Timestamp when jitdump was created */
    uint64_t time_stamp;
    /* Feature flags (currently unused) */
    uint64_t flags;
} Header;
| |
/*
 * Record types of the jitdump format. Each value identifies the layout of
 * the event structure that follows the common BaseEvent header.
 */
enum PerfEvent {
    /* Code load event (new JIT function) */
    PerfLoad = 0,
    /* Code move event (function relocated) */
    PerfMove = 1,
    /* Debug information event */
    PerfDebugInfo = 2,
    /* JIT session close event */
    PerfClose = 3,
    /* Stack unwinding information event */
    PerfUnwindingInfo = 4
};
| |
/*
 * Common prefix shared by every event record in the jitdump file.
 * Each event structure embeds this as its first member.
 */
struct BaseEvent {
    /* Event type (from PerfEvent enum) */
    uint32_t event;
    /* Total size of this event including payload */
    uint32_t size;
    /* Timestamp when event occurred */
    uint64_t time_stamp;
};
| |
| /* |
| * Code load event - indicates a new JIT-compiled function is available |
| * This is the most important event type for Python profiling |
| */ |
| typedef struct { |
| struct BaseEvent base; // Common event header |
| uint32_t process_id; // Process ID where code was generated |
| #if defined(__APPLE__) |
| uint64_t thread_id; // Thread ID where code was generated |
| #else |
| uint32_t thread_id; // Thread ID where code was generated |
| #endif |
| uint64_t vma; // Virtual memory address where code is loaded |
| uint64_t code_address; // Address of the actual machine code |
| uint64_t code_size; // Size of the machine code in bytes |
| uint64_t code_id; // Unique identifier for this code region |
| /* Followed by: |
| * - null-terminated function name string |
| * - raw machine code bytes |
| */ |
| } CodeLoadEvent; |
| |
| /* |
| * Code unwinding information event - provides DWARF data for stack traces |
| * Essential for proper stack unwinding during profiling |
| */ |
| typedef struct { |
| struct BaseEvent base; // Common event header |
| uint64_t unwind_data_size; // Size of the unwinding data |
| uint64_t eh_frame_hdr_size; // Size of the EH frame header |
| uint64_t mapped_size; // Total mapped size (with padding) |
| /* Followed by: |
| * - EH frame header |
| * - DWARF unwinding information |
| * - Padding to alignment boundary |
| */ |
| } CodeUnwindingInfoEvent; |
| |
| // ============================================================================= |
| // GLOBAL STATE MANAGEMENT |
| // ============================================================================= |
| |
| /* |
| * Global state for the perf jitdump implementation |
| * |
| * This structure maintains all the state needed for generating jitdump files. |
| * It's designed as a singleton since there's typically only one jitdump file |
| * per Python process. |
| */ |
| typedef struct { |
| FILE* perf_map; // File handle for the jitdump file |
| PyThread_type_lock map_lock; // Thread synchronization lock |
| void* mapped_buffer; // Memory-mapped region (signals perf we're active) |
| size_t mapped_size; // Size of the mapped region |
| int code_id; // Counter for unique code region identifiers |
| } PerfMapJitState; |
| |
| /* Global singleton instance */ |
| static PerfMapJitState perf_jit_map_state; |
| |
| // ============================================================================= |
| // TIME UTILITIES |
| // ============================================================================= |
| |
| /* Time conversion constant */ |
| static const intptr_t nanoseconds_per_second = 1000000000; |
| |
| /* |
| * Get current monotonic time in nanoseconds |
| * |
| * Monotonic time is preferred for event timestamps because it's not affected |
| * by system clock adjustments. This ensures consistent timing relationships |
| * between events even if the system clock is changed. |
| * |
| * Returns: Current monotonic time in nanoseconds since an arbitrary epoch |
| */ |
| static int64_t get_current_monotonic_ticks(void) { |
| struct timespec ts; |
| if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { |
| Py_UNREACHABLE(); // Should never fail on supported systems |
| return 0; |
| } |
| |
| /* Convert to nanoseconds for maximum precision */ |
| int64_t result = ts.tv_sec; |
| result *= nanoseconds_per_second; |
| result += ts.tv_nsec; |
| return result; |
| } |
| |
/*
 * Read the wall clock via gettimeofday() and express it in microseconds.
 *
 * This value is used for the jitdump file header timestamp. Unlike the
 * monotonic clock it is real calendar time, so it can be correlated with
 * other system events.
 *
 * Returns: seconds * 1e6 + microseconds as reported by gettimeofday().
 *          If gettimeofday() fails, Py_UNREACHABLE() is invoked.
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;

    if (gettimeofday(&now, NULL) < 0) {
        Py_UNREACHABLE();  // Should never fail on supported systems
        return 0;
    }

    int64_t whole_seconds_us = (int64_t)(now.tv_sec) * 1000000;
    return whole_seconds_us + now.tv_usec;
}
| |
| // ============================================================================= |
| // UTILITY FUNCTIONS |
| // ============================================================================= |
| |
/*
 * Round a value up to the next multiple of `multiple`.
 *
 * Used to satisfy the alignment requirements of the jitdump format
 * (typically 8 or 16 bytes).
 *
 * Args:
 *   value: The value to round up
 *   multiple: The multiple to round up to; 0 means "no rounding"
 *
 * Returns: `value` unchanged when `multiple` is 0 or `value` is already a
 *          multiple; otherwise `value` plus the distance to the next
 *          multiple. The result is converted to size_t.
 *
 * NOTE(review): the arithmetic is done on signed 64-bit values; callers are
 * presumably expected to pass non-negative sizes - confirm.
 */
static size_t round_up(int64_t value, int64_t multiple) {
    /* Guard first: a zero multiple would divide by zero below */
    if (multiple == 0) {
        return value;
    }

    int64_t excess = value % multiple;
    return (excess == 0) ? value : value + (multiple - excess);
}
| |
| // ============================================================================= |
| // FILE I/O UTILITIES |
| // ============================================================================= |
| |
| /* |
| * Write data to the jitdump file with error handling |
| * |
| * This function ensures that all data is written to the file, handling |
| * partial writes that can occur with large buffers or when the system |
| * is under load. |
| * |
| * Args: |
| * buffer: Pointer to data to write |
| * size: Number of bytes to write |
| */ |
| static void perf_map_jit_write_fully(const void* buffer, size_t size) { |
| FILE* out_file = perf_jit_map_state.perf_map; |
| const char* ptr = (const char*)(buffer); |
| |
| while (size > 0) { |
| const size_t written = fwrite(ptr, 1, size, out_file); |
| if (written == 0) { |
| Py_UNREACHABLE(); // Write failure - should be very rare |
| break; |
| } |
| size -= written; |
| ptr += written; |
| } |
| } |
| |
| /* |
| * Write the jitdump file header |
| * |
| * The header must be written exactly once at the beginning of each jitdump |
| * file. It provides metadata that perf uses to parse the rest of the file. |
| * |
| * Args: |
| * pid: Process ID to include in the header |
| * out_file: File handle to write to (currently unused, uses global state) |
| */ |
| static void perf_map_jit_write_header(int pid, FILE* out_file) { |
| Header header; |
| |
| /* Initialize header with required values */ |
| header.magic = 0x4A695444; // "JiTD" magic number |
| header.version = 1; // Current jitdump version |
| header.size = sizeof(Header); // Header size for validation |
| header.elf_mach_target = GetElfMachineArchitecture(); // Target architecture |
| header.process_id = pid; // Process identifier |
| header.time_stamp = get_current_time_microseconds(); // Creation time |
| header.flags = 0; // No special flags currently used |
| |
| perf_map_jit_write_fully(&header, sizeof(header)); |
| } |
| |
| // ============================================================================= |
| // DWARF CONSTANTS AND UTILITIES |
| // ============================================================================= |
| |
| /* |
| * DWARF (Debug With Arbitrary Record Formats) constants |
| * |
| * DWARF is a debugging data format used to provide stack unwinding information. |
| * These constants define the various encoding types and opcodes used in |
| * DWARF Call Frame Information (CFI) records. |
| */ |
| |
/* Version number written into the CIE */
#define DWRF_CIE_VERSION 1

/*
 * DWARF call frame instruction opcodes.
 *
 * advance_loc, offset and restore (0x40, 0x80, 0xc0) are combined with an
 * operand using bitwise OR when they are emitted.
 */
enum {
    /* No operation */
    DWRF_CFA_nop = 0x0,
    /* Extended offset instruction */
    DWRF_CFA_offset_extended = 0x5,
    /* Define CFA rule */
    DWRF_CFA_def_cfa = 0xc,
    /* Define CFA register */
    DWRF_CFA_def_cfa_register = 0xd,
    /* Define CFA offset */
    DWRF_CFA_def_cfa_offset = 0xe,
    /* Extended signed offset */
    DWRF_CFA_offset_extended_sf = 0x11,
    /* Advance location counter */
    DWRF_CFA_advance_loc = 0x40,
    /* Simple offset instruction */
    DWRF_CFA_offset = 0x80,
    /* Restore register */
    DWRF_CFA_restore = 0xc0
};
| |
/*
 * DWARF exception-handling pointer encodings (DW_EH_PE_*).
 *
 * A data-format value and a reference-type value are combined with bitwise
 * OR to form one encoding byte.
 */
enum {
    /* Data format */
    DWRF_EH_PE_absptr  = 0x00,   /* Absolute pointer */
    DWRF_EH_PE_uleb128 = 0x01,   /* Unsigned LEB128 */
    DWRF_EH_PE_udata2  = 0x02,   /* Unsigned 2-byte */
    DWRF_EH_PE_udata4  = 0x03,   /* Unsigned 4-byte */
    DWRF_EH_PE_udata8  = 0x04,   /* Unsigned 8-byte */
    DWRF_EH_PE_signed  = 0x08,   /* Signed flag */
    DWRF_EH_PE_sleb128 = 0x09,   /* Signed LEB128 */
    DWRF_EH_PE_sdata2  = 0x0a,   /* Signed 2-byte */
    DWRF_EH_PE_sdata4  = 0x0b,   /* Signed 4-byte */
    DWRF_EH_PE_sdata8  = 0x0c,   /* Signed 8-byte */

    /* Reference type */
    DWRF_EH_PE_pcrel    = 0x10,  /* PC-relative */
    DWRF_EH_PE_textrel  = 0x20,  /* Text-relative */
    DWRF_EH_PE_datarel  = 0x30,  /* Data-relative */
    DWRF_EH_PE_funcrel  = 0x40,  /* Function-relative */
    DWRF_EH_PE_aligned  = 0x50,  /* Aligned */
    DWRF_EH_PE_indirect = 0x80,  /* Indirect */

    /* Marker for "no value present" */
    DWRF_EH_PE_omit = 0xff       /* Omitted value */
};
| |
/* DWARF debug-information constants: tags, child flags, attributes, forms */

/* DIE tags */
enum {
    DWRF_TAG_compile_unit = 0x11
};

/* Whether a DIE has children */
enum {
    DWRF_children_no = 0,
    DWRF_children_yes = 1
};

/* Attribute names */
enum {
    DWRF_AT_name = 0x03,       /* Name attribute */
    DWRF_AT_stmt_list = 0x10,  /* Statement list */
    DWRF_AT_low_pc = 0x11,     /* Low PC address */
    DWRF_AT_high_pc = 0x12     /* High PC address */
};

/* Attribute value forms */
enum {
    DWRF_FORM_addr = 0x01,     /* Address form */
    DWRF_FORM_data4 = 0x06,    /* 4-byte data */
    DWRF_FORM_string = 0x08    /* String form */
};
| |
/* Standard opcodes of the DWARF line number program */
enum {
    /* Extended opcode */
    DWRF_LNS_extended_op = 0,
    /* Copy operation */
    DWRF_LNS_copy = 1,
    /* Advance program counter */
    DWRF_LNS_advance_pc = 2,
    /* Advance line number */
    DWRF_LNS_advance_line = 3
};

/* Extended opcodes of the DWARF line number program */
enum {
    /* End of sequence */
    DWRF_LNE_end_sequence = 1,
    /* Set address */
    DWRF_LNE_set_address = 2
};
| |
/*
 * DWARF register numbers for the supported architectures.
 *
 * The numbers must match the numbering the platform ABI assigns for DWARF,
 * otherwise the unwinder restores the wrong registers. Only x86_64 and
 * little-endian LP64 AArch64 are supported; any other target is a
 * compile-time error.
 */
enum {
#ifdef __x86_64__
    /* x86_64: note that the order is RAX, RDX, RCX, RBX (defined by the ABI) */
    DWRF_REG_AX = 0,   /* RAX */
    DWRF_REG_DX = 1,   /* RDX */
    DWRF_REG_CX = 2,   /* RCX */
    DWRF_REG_BX = 3,   /* RBX */
    DWRF_REG_SI = 4,   /* RSI */
    DWRF_REG_DI = 5,   /* RDI */
    DWRF_REG_BP = 6,   /* RBP */
    DWRF_REG_SP = 7,   /* RSP */
    DWRF_REG_8 = 8,    /* R8 */
    DWRF_REG_9 = 9,    /* R9 */
    DWRF_REG_10 = 10,  /* R10 */
    DWRF_REG_11 = 11,  /* R11 */
    DWRF_REG_12 = 12,  /* R12 */
    DWRF_REG_13 = 13,  /* R13 */
    DWRF_REG_14 = 14,  /* R14 */
    DWRF_REG_15 = 15,  /* R15 */
    DWRF_REG_RA = 16,  /* Return address (RIP) */
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
    /* AArch64 */
    DWRF_REG_FP = 29,  /* Frame Pointer */
    DWRF_REG_RA = 30,  /* Link register (return address) */
    DWRF_REG_SP = 31,  /* Stack pointer */
#else
#    error "Unsupported target architecture"
#endif
};
| |
/*
 * Pointer-encoding bytes used when filling in EH frame headers.
 * The values equal the DWRF_EH_PE_* constants of the same meaning.
 */
static const uint8_t DwarfUData4 = 0x03;   /* Unsigned 4-byte data */
static const uint8_t DwarfSData4 = 0x0b;   /* Signed 4-byte data */
static const uint8_t DwarfPcRel = 0x10;    /* PC-relative encoding */
static const uint8_t DwarfDataRel = 0x30;  /* Data-relative encoding */
| |
| // ============================================================================= |
| // ELF OBJECT CONTEXT |
| // ============================================================================= |
| |
/*
 * Cursor over the buffer in which DWARF unwind data is assembled.
 *
 * The emit helpers append at `p` and advance it; the other pointers record
 * landmarks inside the same buffer that are needed to compute offsets.
 */
typedef struct ELFObjectContext {
    /* Current write position in buffer */
    uint8_t *p;
    /* Start of buffer (for offset calculations) */
    uint8_t *startp;
    /* Start of EH frame data (for relative offsets) */
    uint8_t *eh_frame_p;
    /* Start of FDE data (for PC-relative calculations) */
    uint8_t *fde_p;
    /* Size of the code being described */
    uint32_t code_size;
} ELFObjectContext;
| |
/*
 * EH frame header describing the DWARF unwinding data that follows it.
 *
 * The perf jitdump format requires this header for stack unwinding to work
 * during profiling. The four leading bytes are a version plus three
 * pointer-encoding bytes; the remaining fields are 32-bit values.
 */
typedef struct {
    /* EH frame version (always 1) */
    unsigned char version;
    /* Encoding of EH frame pointer */
    unsigned char eh_frame_ptr_enc;
    /* Encoding of FDE count */
    unsigned char fde_count_enc;
    /* Encoding of table entries */
    unsigned char table_enc;
    /* Pointer to EH frame data */
    int32_t eh_frame_ptr;
    /* Number of FDEs (Frame Description Entries) */
    int32_t eh_fde_count;
    /* Start address of code range */
    int32_t from;
    /* End address of code range */
    int32_t to;
} EhFrameHeader;
| |
| // ============================================================================= |
| // DWARF GENERATION UTILITIES |
| // ============================================================================= |
| |
| /* |
| * Append a null-terminated string to the ELF context buffer |
| * |
| * Args: |
| * ctx: ELF object context |
| * str: String to append (must be null-terminated) |
| * |
| * Returns: Offset from start of buffer where string was written |
| */ |
| static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { |
| uint8_t* p = ctx->p; |
| uint32_t ofs = (uint32_t)(p - ctx->startp); |
| |
| /* Copy string including null terminator */ |
| do { |
| *p++ = (uint8_t)*str; |
| } while (*str++); |
| |
| ctx->p = p; |
| return ofs; |
| } |
| |
| /* |
| * Append a SLEB128 (Signed Little Endian Base 128) value |
| * |
| * SLEB128 is a variable-length encoding used extensively in DWARF. |
| * It efficiently encodes small numbers in fewer bytes. |
| * |
| * Args: |
| * ctx: ELF object context |
| * v: Signed value to encode |
| */ |
| static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { |
| uint8_t* p = ctx->p; |
| |
| /* Encode 7 bits at a time, with continuation bit in MSB */ |
| for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { |
| *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit |
| } |
| *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit |
| |
| ctx->p = p; |
| } |
| |
| /* |
| * Append a ULEB128 (Unsigned Little Endian Base 128) value |
| * |
| * Similar to SLEB128 but for unsigned values. |
| * |
| * Args: |
| * ctx: ELF object context |
| * v: Unsigned value to encode |
| */ |
| static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { |
| uint8_t* p = ctx->p; |
| |
| /* Encode 7 bits at a time, with continuation bit in MSB */ |
| for (; v >= 0x80; v >>= 7) { |
| *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit |
| } |
| *p++ = (char)v; // Final byte without continuation bit |
| |
| ctx->p = p; |
| } |
| |
/*
 * Emit helpers for building DWARF data.
 *
 * All of these expect a local `uint8_t *p` holding the current write
 * position, and advance it. DWRF_UV, DWRF_SV and DWRF_STR additionally
 * expect a local `ctx` (ELFObjectContext *): they store `p` into ctx->p,
 * call the matching elfctx_append_* helper and reload `p` afterwards.
 *
 * The fixed-width writers store through a casted pointer at whatever
 * address `p` currently has.
 */
#define DWRF_U8(x) (*p++ = (x))                        /* unsigned 8-bit */
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)            /* signed 8-bit */
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)      /* unsigned 16-bit */
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)      /* unsigned 32-bit */
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))  /* pointer-sized address */
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)  /* ULEB128 */
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)  /* SLEB128 */
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)  /* C string */

/*
 * Pad with DWRF_CFA_nop bytes until the address held in `p` is a multiple
 * of `s` (which must be a power of two for the mask to work).
 */
#define DWRF_ALIGNNOP(s) \
    while ((uintptr_t)p & ((s)-1)) { \
        *p++ = DWRF_CFA_nop; \
    }

/*
 * Emit a length-prefixed section: reserve a 32-bit length field, run `stmt`
 * to emit the contents, then store the number of bytes emitted after the
 * length field into it.
 */
#define DWRF_SECTION(name, stmt) \
    { \
        uint32_t* szp_##name = (uint32_t*)p; \
        p += 4; \
        stmt; \
        *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
    }
| |
| // ============================================================================= |
| // DWARF EH FRAME GENERATION |
| // ============================================================================= |
| |
| static void elf_init_ehframe(ELFObjectContext* ctx); |
| |
| /* |
| * Initialize DWARF .eh_frame section for a code region |
| * |
| * The .eh_frame section contains Call Frame Information (CFI) that describes |
| * how to unwind the stack at any point in the code. This is essential for |
| * proper profiling as it allows perf to generate accurate call graphs. |
| * |
| * The function generates two main components: |
| * 1. CIE (Common Information Entry) - describes calling conventions |
| * 2. FDE (Frame Description Entry) - describes specific function unwinding |
| * |
| * Args: |
| * ctx: ELF object context containing code size and buffer pointers |
| */ |
| static size_t calculate_eh_frame_size(void) { |
| /* Calculate the EH frame size for the trampoline function */ |
| extern void *_Py_trampoline_func_start; |
| extern void *_Py_trampoline_func_end; |
| |
| size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; |
| |
| ELFObjectContext ctx; |
| char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
| ctx.code_size = code_size; |
| ctx.startp = ctx.p = (uint8_t*)buffer; |
| ctx.fde_p = NULL; |
| |
| elf_init_ehframe(&ctx); |
| return ctx.p - ctx.startp; |
| } |
| |
| static void elf_init_ehframe(ELFObjectContext* ctx) { |
| uint8_t* p = ctx->p; |
| uint8_t* framep = p; // Remember start of frame data |
| |
| /* |
| * DWARF Unwind Table for Trampoline Function |
| * |
| * This section defines DWARF Call Frame Information (CFI) using encoded macros |
| * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function |
| * preserves and restores registers. This is used by profiling tools (e.g., `perf`) |
| * and debuggers for stack unwinding in JIT-compiled code. |
| * |
| * ------------------------------------------------- |
| * TO REGENERATE THIS TABLE FROM GCC OBJECTS: |
| * ------------------------------------------------- |
| * |
| * 1. Create a trampoline source file (e.g., `trampoline.c`): |
| * |
| * #include <Python.h> |
| * typedef PyObject* (*py_evaluator)(void*, void*, int); |
| * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { |
| * return evaluator(ts, f, throwflag); |
| * } |
| * |
| * 2. Compile to an object file with frame pointer preservation: |
| * |
| * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
| * |
| * 3. Extract DWARF unwind info from the object file: |
| * |
| * readelf -w trampoline.o |
| * |
| * Example output from `.eh_frame`: |
| * |
| * 00000000 CIE |
| * Version: 1 |
| * Augmentation: "zR" |
| * Code alignment factor: 4 |
| * Data alignment factor: -8 |
| * Return address column: 30 |
| * DW_CFA_def_cfa: r31 (sp) ofs 0 |
| * |
| * 00000014 FDE cie=00000000 pc=0..14 |
| * DW_CFA_advance_loc: 4 |
| * DW_CFA_def_cfa_offset: 16 |
| * DW_CFA_offset: r29 at cfa-16 |
| * DW_CFA_offset: r30 at cfa-8 |
| * DW_CFA_advance_loc: 12 |
| * DW_CFA_restore: r30 |
| * DW_CFA_restore: r29 |
| * DW_CFA_def_cfa_offset: 0 |
| * |
| * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. |
| * |
| * ---------------------------------- |
| * HOW TO TRANSLATE TO DWRF_* MACROS: |
| * ---------------------------------- |
| * |
| * After compiling your trampoline with: |
| * |
| * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c |
| * |
| * run: |
| * |
| * readelf -w trampoline.o |
| * |
| * to inspect the generated `.eh_frame` data. You will see two main components: |
| * |
| * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. |
| * 2. An FDE (Frame Description Entry): function-specific unwind instructions. |
| * |
| * --------------------- |
| * Translating the CIE: |
| * --------------------- |
| * From `readelf -w`, you might see: |
| * |
| * 00000000 0000000000000010 00000000 CIE |
| * Version: 1 |
| * Augmentation: "zR" |
| * Code alignment factor: 4 |
| * Data alignment factor: -8 |
| * Return address column: 30 |
| * Augmentation data: 1b |
| * DW_CFA_def_cfa: r31 (sp) ofs 0 |
| * |
| * Map this to: |
| * |
| * DWRF_SECTION(CIE, |
| * DWRF_U32(0); // CIE ID (always 0 for CIEs) |
| * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 |
| * DWRF_STR("zR"); // Augmentation string "zR" |
| * DWRF_UV(4); // Code alignment factor = 4 |
| * DWRF_SV(-8); // Data alignment factor = -8 |
| * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) |
| * DWRF_UV(1); // Augmentation data length = 1 |
| * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers |
| * |
| * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa |
| * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) |
| * DWRF_UV(0); // Offset = 0 |
| * |
| * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary |
| * ) |
| * |
| * Notes: |
| * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. |
| * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. |
| * |
| * --------------------- |
| * Translating the FDE: |
| * --------------------- |
| * From `readelf -w`: |
| * |
| * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 |
| * DW_CFA_advance_loc: 4 |
| * DW_CFA_def_cfa_offset: 16 |
| * DW_CFA_offset: r29 at cfa-16 |
| * DW_CFA_offset: r30 at cfa-8 |
| * DW_CFA_advance_loc: 12 |
| * DW_CFA_restore: r30 |
| * DW_CFA_restore: r29 |
| * DW_CFA_def_cfa_offset: 0 |
| * |
| * Map the FDE header and instructions to: |
| * |
| * DWRF_SECTION(FDE, |
| * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) |
| * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) |
| * DWRF_U32(ctx->code_size); // Code range covered by this FDE |
| * DWRF_U8(0); // Augmentation data length (none) |
| * |
| * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) |
| * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 |
| * DWRF_UV(16); |
| * |
| * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) |
| * DWRF_UV(2); // At offset 2 * 8 = 16 bytes |
| * |
| * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) |
| * DWRF_UV(1); // At offset 1 * 8 = 8 bytes |
| * |
| * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) |
| * |
| * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 |
| * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 |
| * |
| * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP |
| * DWRF_UV(0); |
| * ) |
| * |
| * To regenerate: |
| * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. |
| * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as |
| * the code is in a different address space every time. |
| * 3. For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: |
| * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) |
| * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(factored offset), e.g. `cfa-16` with 8-byte slots → DWRF_UV(2) |
| * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_restore | reg) // distinct opcode from DW_CFA_offset; no DWRF_UV() operand follows |
| * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) |
| * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. |
| * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. |
| */ |
| |
| /* |
| * Emit DWARF EH CIE (Common Information Entry) |
| * |
| * The CIE describes the calling conventions and basic unwinding rules |
| * that apply to all functions in this compilation unit. |
| */ |
| DWRF_SECTION(CIE, |
| DWRF_U32(0); // CIE ID (0 indicates this is a CIE) |
| DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) |
| DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) |
| #ifdef __x86_64__ |
| DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) |
| #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
| DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) |
| #endif |
| DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) |
| DWRF_U8(DWRF_REG_RA); // Return address register number |
| DWRF_UV(1); // Augmentation data length |
| DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding |
| |
| /* Initial CFI instructions - describe default calling convention */ |
| #ifdef __x86_64__ |
| /* x86_64 initial CFI state */ |
| DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) |
| DWRF_UV(DWRF_REG_SP); // CFA = SP register |
| DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size |
| DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved |
| DWRF_UV(1); // At offset 1 from CFA |
| #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
| /* AArch64 initial CFI state */ |
| DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) |
| DWRF_UV(DWRF_REG_SP); // CFA = SP register |
| DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) |
| // No initial register saves in AArch64 CIE |
| #endif |
| DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
| ) |
| |
| ctx->eh_frame_p = p; // Remember start of FDE data |
| |
| /* |
| * Emit DWARF EH FDE (Frame Description Entry) |
| * |
| * The FDE describes unwinding information specific to this function. |
| * It references the CIE and provides function-specific CFI instructions. |
| * |
| * The PC-relative offset is calculated after the entire EH frame is built |
| * to ensure accurate positioning relative to the synthesized DSO layout. |
| */ |
| DWRF_SECTION(FDE, |
| DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) |
| ctx->fde_p = p; // Remember where PC offset field is located for later calculation |
| DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) |
| DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) |
| DWRF_U8(0); // Augmentation data length (none) |
| |
| /* |
| * Architecture-specific CFI instructions |
| * |
| * These instructions describe how registers are saved and restored |
| * during function calls. Each architecture has different calling |
| * conventions and register usage patterns. |
| */ |
| #ifdef __x86_64__ |
| /* x86_64 calling convention unwinding rules with frame pointer */ |
| # if defined(__CET__) && (__CET__ & 1) |
| DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) |
| # endif |
| DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) |
| DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 |
| DWRF_UV(16); // New offset: SP + 16 |
| DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 |
| DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes |
| DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) |
| DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 |
| DWRF_UV(DWRF_REG_BP); // Use base pointer register |
| DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 |
| DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 |
| DWRF_UV(DWRF_REG_SP); // Use stack pointer register |
| DWRF_UV(8); // New offset: SP + 8 |
| #elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) |
| /* AArch64 calling convention unwinding rules */ |
| DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) |
| DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 |
| DWRF_UV(16); // Stack pointer moved by 16 bytes |
| DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved |
| DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) |
| DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved |
| DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) |
| DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) |
| DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! |
| DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! |
| DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) |
| DWRF_UV(0); // Back to original stack position |
| #else |
| # error "Unsupported target architecture" |
| #endif |
| |
| DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary |
| ) |
| |
| ctx->p = p; // Update context pointer to end of generated data |
| |
| /* Calculate and update the PC-relative offset in the FDE |
| * |
| * When perf processes the jitdump, it creates a synthesized DSO with this layout: |
| * |
| * Synthesized DSO Memory Layout: |
| * ┌─────────────────────────────────────────────────────────────┐ < code_start |
| * │ Code Section │ |
| * │ (round_up(code_size, 8) bytes) │ |
| * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data |
| * │ EH Frame Data │ |
| * │ ┌─────────────────────────────────────────────────────┐ │ |
| * │ │ CIE data │ │ |
| * │ └─────────────────────────────────────────────────────┘ │ |
| * │ ┌─────────────────────────────────────────────────────┐ │ |
| * │ │ FDE Header: │ │ |
| * │ │ - CIE offset (4 bytes) │ │ |
| * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start |
| * │ │ - address range (4 bytes) │ │ (this specific field) |
| * │ │ CFI Instructions... │ │ |
| * │ └─────────────────────────────────────────────────────┘ │ |
| * ├─────────────────────────────────────────────────────────────┤ < reference_point |
| * │ EhFrameHeader │ |
| * │ (navigation metadata) │ |
| * └─────────────────────────────────────────────────────────────┘ |
| * |
| * The PC offset field in the FDE must contain the distance from itself to code_start: |
| * |
| * distance = code_start - fde_pc_field |
| * |
| * Where: |
| * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame |
| * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) |
| * |
| * Therefore: |
| * distance = code_start_location - fde_pc_field_location |
| * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) |
| * = -rounded_code_size - fde_offset_in_frame |
| * = -(round_up(code_size, 8) + fde_offset_in_frame) |
| * |
| * Note: fde_offset_in_frame is the offset from the start of the EH frame data to the PC offset field. |
| * |
| */ |
| if (ctx->fde_p != NULL) { |
| int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); |
| int32_t rounded_code_size = round_up(ctx->code_size, 8); |
| int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); |
| |
| |
| // Update the PC-relative offset in the FDE |
| *(int32_t*)ctx->fde_p = pc_relative_offset; |
| } |
| } |
| |
| // ============================================================================= |
| // JITDUMP INITIALIZATION |
| // ============================================================================= |
| |
| /* |
| * Initialize the perf jitdump interface |
| * |
| * This function sets up everything needed to generate jitdump files: |
| * 1. Creates the jitdump file with a unique name |
| * 2. Maps the first page to signal perf that we're using the interface |
| * 3. Writes the jitdump header |
| * 4. Initializes synchronization primitives |
| * |
| * The memory mapping is crucial - perf detects jitdump files by scanning |
| * for processes that have mapped files matching the pattern /tmp/jit-*.dump |
| * |
| * Returns: Pointer to initialized state, or NULL on failure |
| */ |
| static void* perf_map_jit_init(void) { |
| char filename[100]; |
| int pid = getpid(); |
| |
| /* Create unique filename based on process ID */ |
| snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid); |
| |
| /* Create/open the jitdump file with appropriate permissions */ |
| const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); |
| if (fd == -1) { |
| return NULL; // Failed to create file |
| } |
| |
| /* Get system page size for memory mapping */ |
| const long page_size = sysconf(_SC_PAGESIZE); |
| if (page_size == -1) { |
| close(fd); |
| return NULL; // Failed to get page size |
| } |
| |
| #if defined(__APPLE__) |
| // On macOS, samply uses a preload to find jitdumps and this mmap can be slow. |
| perf_jit_map_state.mapped_buffer = NULL; |
| #else |
| /* |
| * Map the first page of the jitdump file |
| * |
| * This memory mapping serves as a signal to perf that this process |
| * is generating JIT code. Perf scans /proc/.../maps looking for mapped |
| * files that match the jitdump naming pattern. |
| * |
| * The mapping must be PROT_READ | PROT_EXEC to be detected by perf. |
| */ |
| perf_jit_map_state.mapped_buffer = mmap( |
| NULL, // Let kernel choose address |
| page_size, // Map one page |
| PROT_READ | PROT_EXEC, // Read and execute permissions (required by perf) |
| MAP_PRIVATE, // Private mapping |
| fd, // File descriptor |
| 0 // Offset 0 (first page) |
| ); |
| |
| if (perf_jit_map_state.mapped_buffer == NULL) { |
| close(fd); |
| return NULL; // Memory mapping failed |
| } |
| #endif |
| |
| perf_jit_map_state.mapped_size = page_size; |
| |
| /* Convert file descriptor to FILE* for easier I/O operations */ |
| perf_jit_map_state.perf_map = fdopen(fd, "w+"); |
| if (perf_jit_map_state.perf_map == NULL) { |
| close(fd); |
| return NULL; // Failed to create FILE* |
| } |
| |
| /* |
| * Set up file buffering for better performance |
| * |
| * We use a large buffer (2MB) because jitdump files can be written |
| * frequently during program execution. Buffering reduces system call |
| * overhead and improves overall performance. |
| */ |
| setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB); |
| |
| /* Write the jitdump file header */ |
| perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); |
| |
| /* |
| * Initialize thread synchronization lock |
| * |
| * Multiple threads may attempt to write to the jitdump file |
| * simultaneously. This lock ensures thread-safe access to the |
| * global jitdump state. |
| */ |
| perf_jit_map_state.map_lock = PyThread_allocate_lock(); |
| if (perf_jit_map_state.map_lock == NULL) { |
| fclose(perf_jit_map_state.perf_map); |
| return NULL; // Failed to create lock |
| } |
| |
| /* Initialize code ID counter */ |
| perf_jit_map_state.code_id = 0; |
| |
| /* Calculate padding size based on actual unwind info requirements */ |
| size_t eh_frame_size = calculate_eh_frame_size(); |
| size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
| trampoline_api.code_padding = round_up(unwind_data_size, 16); |
| trampoline_api.code_alignment = 32; |
| |
| return &perf_jit_map_state; |
| } |
| |
| // ============================================================================= |
| // MAIN JITDUMP ENTRY WRITING |
| // ============================================================================= |
| |
| /* |
| * Write a complete jitdump entry for a Python function |
| * |
| * This is the main function called by Python's trampoline system whenever |
| * a new piece of JIT-compiled code needs to be recorded. It writes both |
| * the unwinding information and the code load event to the jitdump file. |
| * |
| * The function performs these steps: |
| * 1. Initialize jitdump system if not already done |
| * 2. Extract function name and filename from Python code object |
| * 3. Generate DWARF unwinding information |
| * 4. Write unwinding info event to jitdump file |
| * 5. Write code load event to jitdump file |
| * |
| * Args: |
| * state: Jitdump state (currently unused, uses global state) |
| * code_addr: Address where the compiled code resides |
| * code_size: Size of the compiled code in bytes |
| * co: Python code object containing metadata |
| * |
| * IMPORTANT: This function signature is part of Python's internal API |
| * and must not be changed without coordinating with core Python development. |
| */ |
| static void perf_map_jit_write_entry(void *state, const void *code_addr, |
| unsigned int code_size, PyCodeObject *co) |
| { |
| /* Initialize jitdump system on first use */ |
| if (perf_jit_map_state.perf_map == NULL) { |
| void* ret = perf_map_jit_init(); |
| if(ret == NULL){ |
| return; // Initialization failed, silently abort |
| } |
| } |
| |
| /* |
| * Extract function information from Python code object |
| * |
| * We create a human-readable function name by combining the qualified |
| * name (includes class/module context) with the filename. This helps |
| * developers identify functions in perf reports. |
| */ |
| const char *entry = ""; |
| if (co->co_qualname != NULL) { |
| entry = PyUnicode_AsUTF8(co->co_qualname); |
| } |
| |
| const char *filename = ""; |
| if (co->co_filename != NULL) { |
| filename = PyUnicode_AsUTF8(co->co_filename); |
| } |
| |
| /* |
| * Create formatted function name for perf display |
| * |
| * Format: "py::<function_name>:<filename>" |
| * The "py::" prefix helps identify Python functions in mixed-language |
| * profiles (e.g., when profiling C extensions alongside Python code). |
| */ |
| size_t perf_map_entry_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; |
| char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size); |
| if (perf_map_entry == NULL) { |
| return; // Memory allocation failed |
| } |
| snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename); |
| |
| const size_t name_length = strlen(perf_map_entry); |
| uword base = (uword)code_addr; |
| uword size = code_size; |
| |
| /* |
| * Generate DWARF unwinding information |
| * |
| * DWARF data is essential for proper stack unwinding during profiling. |
| * Without it, perf cannot generate accurate call graphs, especially |
| * in optimized code where frame pointers may be omitted. |
| */ |
| ELFObjectContext ctx; |
| char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) |
| ctx.code_size = code_size; |
| ctx.startp = ctx.p = (uint8_t*)buffer; |
| ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written |
| |
| /* Generate EH frame (Exception Handling frame) data */ |
| elf_init_ehframe(&ctx); |
| int eh_frame_size = ctx.p - ctx.startp; |
| |
| /* |
| * Write Code Unwinding Information Event |
| * |
| * This event must be written before the code load event to ensure |
| * perf has the unwinding information available when it processes |
| * the code region. |
| */ |
| CodeUnwindingInfoEvent ev2; |
| ev2.base.event = PerfUnwindingInfo; |
| ev2.base.time_stamp = get_current_monotonic_ticks(); |
| ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; |
| |
| /* Verify we don't exceed our padding budget */ |
| assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); |
| |
| ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); |
| ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment |
| |
| /* Calculate total event size with padding */ |
| int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; |
| int padding_size = round_up(content_size, 8) - content_size; // 8-byte align |
| ev2.base.size = content_size + padding_size; |
| |
| /* Write the unwinding info event header */ |
| perf_map_jit_write_fully(&ev2, sizeof(ev2)); |
| |
| /* |
| * Write EH Frame Header |
| * |
| * The EH frame header provides metadata about the DWARF unwinding |
| * information that follows. It includes pointers and counts that |
| * help perf navigate the unwinding data efficiently. |
| */ |
| EhFrameHeader f; |
| f.version = 1; |
| f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte |
| f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count |
| f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte |
| |
| /* Calculate relative offsets for EH frame navigation */ |
| f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); |
| f.eh_fde_count = 1; // We generate exactly one FDE per function |
| f.from = -(round_up(code_size, 8) + eh_frame_size); |
| |
| int cie_size = ctx.eh_frame_p - ctx.startp; |
| f.to = -(eh_frame_size - cie_size); |
| |
| /* Write EH frame data and header */ |
| perf_map_jit_write_fully(ctx.startp, eh_frame_size); |
| perf_map_jit_write_fully(&f, sizeof(f)); |
| |
| /* Write padding to maintain alignment */ |
| char padding_bytes[] = "\0\0\0\0\0\0\0\0"; |
| perf_map_jit_write_fully(&padding_bytes, padding_size); |
| |
| /* |
| * Write Code Load Event |
| * |
| * This event tells perf about the new code region. It includes: |
| * - Memory addresses and sizes |
| * - Process and thread identification |
| * - Function name for symbol resolution |
| * - The actual machine code bytes |
| */ |
| CodeLoadEvent ev; |
| ev.base.event = PerfLoad; |
| ev.base.size = sizeof(ev) + (name_length+1) + size; |
| ev.base.time_stamp = get_current_monotonic_ticks(); |
| ev.process_id = getpid(); |
| #if defined(__APPLE__) |
| pthread_threadid_np(NULL, &ev.thread_id); |
| #else |
| ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call |
| #endif |
| ev.vma = base; // Virtual memory address |
| ev.code_address = base; // Same as VMA for our use case |
| ev.code_size = size; |
| |
| /* Assign unique code ID and increment counter */ |
| perf_jit_map_state.code_id += 1; |
| ev.code_id = perf_jit_map_state.code_id; |
| |
| /* Write code load event and associated data */ |
| perf_map_jit_write_fully(&ev, sizeof(ev)); |
| perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator |
| perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code |
| |
| /* Clean up allocated memory */ |
| PyMem_RawFree(perf_map_entry); |
| } |
| |
| // ============================================================================= |
| // CLEANUP AND FINALIZATION |
| // ============================================================================= |
| |
| /* |
| * Finalize and cleanup the perf jitdump system |
| * |
| * This function is called when Python is shutting down or when the |
| * perf trampoline system is being disabled. It ensures all resources |
| * are properly released and all buffered data is flushed to disk. |
| * |
| * Args: |
| * state: Jitdump state (currently unused, uses global state) |
| * |
| * Returns: 0 on success |
| * |
| * IMPORTANT: This function signature is part of Python's internal API |
| * and must not be changed without coordinating with core Python development. |
| */ |
| static int perf_map_jit_fini(void* state) { |
| /* |
| * Close jitdump file with proper synchronization |
| * |
| * We need to acquire the lock to ensure no other threads are |
| * writing to the file when we close it. This prevents corruption |
| * and ensures all data is properly flushed. |
| */ |
| if (perf_jit_map_state.perf_map != NULL) { |
| PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); |
| fclose(perf_jit_map_state.perf_map); // This also flushes buffers |
| PyThread_release_lock(perf_jit_map_state.map_lock); |
| |
| /* Clean up synchronization primitive */ |
| PyThread_free_lock(perf_jit_map_state.map_lock); |
| perf_jit_map_state.perf_map = NULL; |
| } |
| |
| /* |
| * Unmap the memory region |
| * |
| * This removes the signal to perf that we were generating JIT code. |
| * After this point, perf will no longer detect this process as |
| * having JIT capabilities. |
| */ |
| if (perf_jit_map_state.mapped_buffer != NULL) { |
| munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size); |
| perf_jit_map_state.mapped_buffer = NULL; |
| } |
| |
| /* Clear global state reference */ |
| trampoline_api.state = NULL; |
| |
| return 0; // Success |
| } |
| |
| // ============================================================================= |
| // PUBLIC API EXPORT |
| // ============================================================================= |
| |
| /* |
| * Python Perf Callbacks Structure |
| * |
| * This structure defines the callback interface that Python's trampoline |
| * system uses to integrate with perf profiling. It contains function |
| * pointers for initialization, event writing, and cleanup. |
| * |
| * CRITICAL: This structure and its contents are part of Python's internal |
| * API. The function signatures and behavior must remain stable to maintain |
| * compatibility with the Python interpreter's perf integration system. |
| * |
| * Used by: Python's _PyPerf_Callbacks system in pycore_ceval.h |
| */ |
| _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { |
| &perf_map_jit_init, // Initialization function |
| &perf_map_jit_write_entry, // Event writing function |
| &perf_map_jit_fini, // Cleanup function |
| }; |
| |
| #endif /* PY_HAVE_PERF_TRAMPOLINE */ |