| /* ********************************************************** |
| * Copyright (c) 2013 Google, Inc. All rights reserved. |
| * Copyright (c) 2000-2008 VMware, Inc. All rights reserved. |
| * **********************************************************/ |
| |
| /* |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * |
| * * Redistributions of source code must retain the above copyright notice, |
| * this list of conditions and the following disclaimer. |
| * |
| * * Redistributions in binary form must reproduce the above copyright notice, |
| * this list of conditions and the following disclaimer in the documentation |
| * and/or other materials provided with the distribution. |
| * |
| * * Neither the name of VMware, Inc. nor the names of its contributors may be |
| * used to endorse or promote products derived from this software without |
| * specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| * DAMAGE. |
| */ |
| |
| /* Copyright (c) 2003-2007 Determina Corp. */ |
| /* Copyright (c) 2001-2003 Massachusetts Institute of Technology */ |
| /* Copyright (c) 2000-2001 Hewlett-Packard Company */ |
| |
| /* |
| * proc.c - processor-specific routines |
| */ |
| |
| #include "../globals.h" |
| #include "proc.h" |
| #include "instr.h" /* for dr_insert_{save,restore}_fpstate */ |
| #include "instrument.h" /* for dr_insert_{save,restore}_fpstate */ |
| #include "instr_create.h" /* for dr_insert_{save,restore}_fpstate */ |
| #include "decode.h" /* for dr_insert_{save,restore}_fpstate */ |
| |
#ifdef DEBUG
/* case 10450: failures in this file should produce messages for clients, so
 * the internal assert macros below are poisoned to force use of CLIENT_ASSERT.
 * ASSERT itself cannot be undefined because of DYNAMO_OPTION.
 */
# undef ASSERT_TRUNCATE
# define ASSERT_TRUNCATE DO_NOT_USE_ASSERT_USE_CLIENT_ASSERT_INSTEAD
# undef ASSERT_BITFIELD_TRUNCATE
# define ASSERT_BITFIELD_TRUNCATE DO_NOT_USE_ASSERT_USE_CLIENT_ASSERT_INSTEAD
# undef ASSERT_NOT_REACHED
# define ASSERT_NOT_REACHED DO_NOT_USE_ASSERT_USE_CLIENT_ASSERT_INSTEAD
#endif
| |
/* Vendor identification values as returned by cpuid in ebx:edx:ecx.
 * Each register holds four ASCII characters, lowest byte first.
 */

/* Intel processors: ebx:edx:ecx spell GenuineIntel */
#define INTEL_EBX 0x756e6547 /* Genu */
#define INTEL_EDX 0x49656e69 /* ineI */
#define INTEL_ECX 0x6c65746e /* ntel */

/* AMD processors: ebx:edx:ecx spell AuthenticAMD */
#define AMD_EBX 0x68747541 /* Auth */
#define AMD_EDX 0x69746e65 /* enti */
#define AMD_ECX 0x444d4163 /* cAMD */
| |
| /* cache_line_size is exported for efficient access. |
| * FIXME: In case the processor doesn't support the |
| * cpuid instruction, use a default value of 32. |
| * (see case 463 for discussion) |
| */ |
| size_t cache_line_size = 32; |
| static ptr_uint_t mask; /* bits that should be 0 to be cache-line-aligned */ |
| |
| static uint L1_icache_size = CACHE_SIZE_UNKNOWN; |
| static uint L1_dcache_size = CACHE_SIZE_UNKNOWN; |
| static uint L2_cache_size = CACHE_SIZE_UNKNOWN; |
| |
| static uint vendor = VENDOR_UNKNOWN; |
| static uint family = 0; |
| static uint type = 0; |
| static uint model = 0; |
| static uint stepping = 0; |
| |
| /* Feature bits in 4 32-bit values: features in edx, |
| * features in ecx, extended features in edx, and |
| * extended features in ecx |
| */ |
| static features_t features = {0, 0, 0, 0}; |
| |
| /* The brand string is a 48-character, null terminated string. |
| * Declare as a 12-element uint so the compiler won't complain |
| * when we store GPRs to it. Initialization is "unknown" . |
| */ |
| static uint brand_string[12] = {0x6e6b6e75, 0x006e776f}; |
| |
| static bool avx_enabled; |
| |
| static void |
| set_cache_size(uint val, uint *dst) |
| { |
| CLIENT_ASSERT(dst != NULL, "invalid internal param"); |
| switch (val) { |
| case 8: *dst = CACHE_SIZE_8_KB; break; |
| case 16: *dst = CACHE_SIZE_16_KB; break; |
| case 32: *dst = CACHE_SIZE_32_KB; break; |
| case 64: *dst = CACHE_SIZE_64_KB; break; |
| case 128: *dst = CACHE_SIZE_128_KB; break; |
| case 256: *dst = CACHE_SIZE_256_KB; break; |
| case 512: *dst = CACHE_SIZE_512_KB; break; |
| case 1024: *dst = CACHE_SIZE_1_MB; break; |
| case 2048: *dst = CACHE_SIZE_2_MB; break; |
| default: SYSLOG_INTERNAL_ERROR("Unknown processor cache size"); break; |
| } |
| } |
| |
| static void |
| get_cache_sizes_amd(uint max_ext_val) |
| { |
| uint cpuid_res_local[4]; /* eax, ebx, ecx, and edx registers (in that order) */ |
| |
| if (max_ext_val >= 0x80000005) { |
| #ifdef UNIX |
| our_cpuid((int*)cpuid_res_local, 0x80000005); |
| #else |
| __cpuid(cpuid_res_local, 0x80000005); |
| #endif |
| set_cache_size((cpuid_res_local[2]/*ecx*/ >> 24), &L1_icache_size); |
| set_cache_size((cpuid_res_local[3]/*edx*/ >> 24), &L1_dcache_size); |
| } |
| |
| if (max_ext_val >= 0x80000006) { |
| #ifdef UNIX |
| our_cpuid((int*)cpuid_res_local, 0x80000006); |
| #else |
| __cpuid(cpuid_res_local, 0x80000006); |
| #endif |
| set_cache_size((cpuid_res_local[2]/*ecx*/ >> 16), &L2_cache_size); |
| } |
| } |
| |
| static void |
| get_cache_sizes_intel(uint max_val) |
| { |
| /* declare as uint so compiler won't complain when we write GP regs to the array */ |
| uint cache_codes[4]; |
| int i; |
| |
| if (max_val < 2) |
| return; |
| |
| #ifdef UNIX |
| our_cpuid((int*)cache_codes, 2); |
| #else |
| __cpuid(cache_codes, 2); |
| #endif |
| /* The lower 8 bits of eax specify the number of times cpuid |
| * must be executed to obtain a complete picture of the cache |
| * characteristics. |
| */ |
| CLIENT_ASSERT((cache_codes[0] & 0xff) == 1, "cpuid error"); |
| cache_codes[0] &= ~0xff; |
| |
| /* Cache codes are stored in consecutive bytes in the |
| * GP registers. For each register, a 1 in bit 31 |
| * indicates that the codes should be ignored... zero |
| * all four bytes when that happens |
| */ |
| for (i=0; i<4; i++) { |
| if (cache_codes[i] & 0x80000000) |
| cache_codes[i] = 0; |
| } |
| |
| /* Table 3-17, pg 3-171 of IA-32 instruction set reference lists |
| * all codes. Omitting L3 cache characteristics for now... |
| */ |
| for (i=0; i<16; i++) { |
| switch (((uchar*)cache_codes)[i]) { |
| case 0x06: L1_icache_size = CACHE_SIZE_8_KB; break; |
| case 0x08: L1_icache_size = CACHE_SIZE_16_KB; break; |
| case 0x0a: L1_dcache_size = CACHE_SIZE_8_KB; break; |
| case 0x0c: L1_dcache_size = CACHE_SIZE_16_KB; break; |
| case 0x2c: L1_dcache_size = CACHE_SIZE_32_KB; break; |
| case 0x30: L1_icache_size = CACHE_SIZE_32_KB; break; |
| case 0x41: L2_cache_size = CACHE_SIZE_128_KB; break; |
| case 0x42: L2_cache_size = CACHE_SIZE_256_KB; break; |
| case 0x43: L2_cache_size = CACHE_SIZE_512_KB; break; |
| case 0x44: L2_cache_size = CACHE_SIZE_1_MB; break; |
| case 0x45: L2_cache_size = CACHE_SIZE_2_MB; break; |
| case 0x60: L1_dcache_size = CACHE_SIZE_16_KB; break; |
| case 0x66: L1_dcache_size = CACHE_SIZE_8_KB; break; |
| case 0x67: L1_dcache_size = CACHE_SIZE_16_KB; break; |
| case 0x68: L1_dcache_size = CACHE_SIZE_32_KB; break; |
| case 0x78: L2_cache_size = CACHE_SIZE_1_MB; break; |
| case 0x79: L2_cache_size = CACHE_SIZE_128_KB; break; |
| case 0x7a: L2_cache_size = CACHE_SIZE_256_KB; break; |
| case 0x7b: L2_cache_size = CACHE_SIZE_512_KB; break; |
| case 0x7c: L2_cache_size = CACHE_SIZE_1_MB; break; |
| case 0x7d: L2_cache_size = CACHE_SIZE_2_MB; break; |
| case 0x7f: L2_cache_size = CACHE_SIZE_512_KB; break; |
| case 0x82: L2_cache_size = CACHE_SIZE_256_KB; break; |
| case 0x83: L2_cache_size = CACHE_SIZE_512_KB; break; |
| case 0x84: L2_cache_size = CACHE_SIZE_1_MB; break; |
| case 0x85: L2_cache_size = CACHE_SIZE_2_MB; break; |
| case 0x86: L2_cache_size = CACHE_SIZE_512_KB; break; |
| case 0x87: L2_cache_size = CACHE_SIZE_1_MB; break; |
| default: break; |
| } |
| } |
| } |
| |
| /* |
| * On Pentium through Pentium III, I-cache lines are 32 bytes. |
| * On Pentium IV they are 64 bytes. |
| */ |
| static void |
| get_processor_specific_info(void) |
| { |
| /* use cpuid instruction to get processor info. For details, see |
| * http://download.intel.com/design/Xeon/applnots/24161830.pdf |
| * "AP-485: Intel Processor Identification and the CPUID |
| * instruction", 96 pages, January 2006 |
| */ |
| uint res_eax, res_ebx = 0, res_ecx = 0, res_edx = 0; |
| uint max_val, max_ext_val; |
| int cpuid_res_local[4]; /* eax, ebx, ecx, and edx registers (in that order) */ |
| |
| /* First check for existence of the cpuid instruction |
| * by attempting to modify bit 21 of eflags |
| */ |
| /* FIXME: Perhaps we should abort when the cpuid instruction |
| * doesn't exist since the cache_line_size may be incorrect. |
| * (see case 463 for discussion) |
| */ |
| if (!cpuid_supported()) { |
| ASSERT_CURIOSITY(false && "cpuid instruction unsupported"); |
| SYSLOG_INTERNAL_WARNING("cpuid instruction unsupported -- cache_line_size " |
| "may be incorrect"); |
| return; |
| } |
| |
| /* first verify on Intel processor */ |
| #ifdef UNIX |
| our_cpuid(cpuid_res_local, 0); |
| #else |
| __cpuid(cpuid_res_local, 0); |
| #endif |
| res_eax = cpuid_res_local[0]; |
| res_ebx = cpuid_res_local[1]; |
| res_ecx = cpuid_res_local[2]; |
| res_edx = cpuid_res_local[3]; |
| max_val = res_eax; |
| |
| if (res_ebx == INTEL_EBX) { |
| vendor = VENDOR_INTEL; |
| CLIENT_ASSERT(res_edx == INTEL_EDX && res_ecx == INTEL_ECX, |
| "unknown Intel processor type"); |
| } else if (res_ebx == AMD_EBX) { |
| vendor = VENDOR_AMD; |
| CLIENT_ASSERT(res_edx == AMD_EDX && res_ecx == AMD_ECX, |
| "unknown AMD processor type"); |
| } else { |
| vendor = VENDOR_UNKNOWN; |
| SYSLOG_INTERNAL_ERROR("Running on unknown processor type"); |
| LOG(GLOBAL, LOG_TOP, 1, "cpuid returned "PFX" "PFX" "PFX" "PFX"\n", |
| res_eax, res_ebx, res_ecx, res_edx); |
| } |
| |
| /* Try to get extended cpuid information */ |
| #ifdef UNIX |
| our_cpuid(cpuid_res_local, 0x80000000); |
| #else |
| __cpuid(cpuid_res_local, 0x80000000); |
| #endif |
| max_ext_val = cpuid_res_local[0]/*eax*/; |
| |
| /* Extended feature flags */ |
| if (max_ext_val >= 0x80000001) { |
| #ifdef UNIX |
| our_cpuid(cpuid_res_local, 0x80000001); |
| #else |
| __cpuid(cpuid_res_local, 0x80000001); |
| #endif |
| res_ecx = cpuid_res_local[2]; |
| res_edx = cpuid_res_local[3]; |
| features.ext_flags_edx = res_edx; |
| features.ext_flags_ecx = res_ecx; |
| } |
| |
| /* now get processor info */ |
| #ifdef UNIX |
| our_cpuid(cpuid_res_local, 1); |
| #else |
| __cpuid(cpuid_res_local, 1); |
| #endif |
| res_eax = cpuid_res_local[0]; |
| res_ebx = cpuid_res_local[1]; |
| res_ecx = cpuid_res_local[2]; |
| res_edx = cpuid_res_local[3]; |
| /* eax contains basic info: |
| * extended family, extended model, type, family, model, stepping id |
| * 20:27, 16:19, 12:13, 8:11, 4:7, 0:3 |
| */ |
| type = (res_eax >> 12) & 0x3; |
| family = (res_eax >> 8) & 0xf; |
| model = (res_eax >> 4) & 0xf; |
| stepping = res_eax & 0xf; |
| |
| /* Pages 3-164 and 3-165 of the IA-32 instruction set |
| * reference instruct us to adjust the family and model |
| * numbers as follows. |
| */ |
| if (family == 0x6 || family == 0xf) { |
| uint ext_model = (res_eax >> 16) & 0xf; |
| model += (ext_model << 4); |
| |
| if (family == 0xf) { |
| uint ext_family = (res_eax >> 20) & 0xff; |
| family += ext_family; |
| } |
| } |
| |
| features.flags_edx = res_edx; |
| features.flags_ecx = res_ecx; |
| |
| /* Now features.* are complete and we can query */ |
| if (proc_has_feature(FEATURE_CLFSH)) { |
| /* The new manuals imply ebx always holds the |
| * cache line size for clflush, not just on P4 |
| */ |
| cache_line_size = (res_ebx & 0x0000ff00) >> 5; /* (x >> 8) * 8 == x >> 5 */ |
| } else if (vendor == VENDOR_INTEL && |
| (family == FAMILY_PENTIUM_3 || family == FAMILY_PENTIUM_2)) { |
| /* Pentium III, Pentium II */ |
| cache_line_size = 32; |
| } else if (vendor == VENDOR_AMD && family == FAMILY_ATHLON) { |
| /* Athlon */ |
| cache_line_size = 64; |
| #ifdef IA32_ON_IA64 |
| } else if (vendor == VENDOR_INTEL && family == FAMILY_IA64) { |
| /* Itanium */ |
| cache_line_size = 32; |
| #endif |
| } else { |
| LOG(GLOBAL, LOG_TOP, 1, "Warning: running on unsupported processor family %d\n", |
| family); |
| cache_line_size = 32; |
| } |
| /* people who use this in ALIGN* macros are assuming it's a power of 2 */ |
| CLIENT_ASSERT((cache_line_size & (cache_line_size - 1)) == 0, |
| "invalid cache line size"); |
| |
| /* get L1 and L2 cache sizes */ |
| if (vendor == VENDOR_AMD) |
| get_cache_sizes_amd(max_ext_val); |
| else |
| get_cache_sizes_intel(max_val); |
| |
| /* Processor brand string */ |
| if (max_ext_val >= 0x80000004) { |
| #ifdef UNIX |
| our_cpuid((int*)&brand_string[0], 0x80000002); |
| our_cpuid((int*)&brand_string[4], 0x80000003); |
| our_cpuid((int*)&brand_string[8], 0x80000004); |
| #else |
| __cpuid(&brand_string[0], 0x80000002); |
| __cpuid(&brand_string[4], 0x80000003); |
| __cpuid(&brand_string[8], 0x80000004); |
| #endif |
| } |
| } |
| |
| void |
| proc_init(void) |
| { |
| LOG(GLOBAL, LOG_TOP, 1, "Running on a %d CPU machine\n", get_num_processors()); |
| |
| get_processor_specific_info(); |
| CLIENT_ASSERT(cache_line_size > 0, "invalid cache line size"); |
| mask = (cache_line_size - 1); |
| |
| LOG(GLOBAL, LOG_TOP, 1, "Cache line size is %d bytes\n", cache_line_size); |
| LOG(GLOBAL, LOG_TOP, 1, "L1 icache=%s, L1 dcache=%s, L2 cache=%s\n", |
| proc_get_cache_size_str(proc_get_L1_icache_size()), |
| proc_get_cache_size_str(proc_get_L1_dcache_size()), |
| proc_get_cache_size_str(proc_get_L2_cache_size())); |
| LOG(GLOBAL, LOG_TOP, 1, "Processor brand string = %s\n", brand_string); |
| LOG(GLOBAL, LOG_TOP, 1, "Type=0x%x, Family=0x%x, Model=0x%x, Stepping=0x%x\n", |
| type, family, model, stepping); |
| |
| #ifdef X86 |
| # ifdef X64 |
| CLIENT_ASSERT(proc_has_feature(FEATURE_LAHF), |
| "Unsupported processor type - processor must support LAHF/SAHF in " |
| "64bit mode."); |
| if (!proc_has_feature(FEATURE_LAHF)) { |
| FATAL_USAGE_ERROR(UNSUPPORTED_PROCESSOR_LAHF, 2, |
| get_application_name(), get_application_pid()); |
| } |
| # endif |
| |
| # ifdef DEBUG |
| /* FIXME: This is a small subset of processor features. If we |
| * care enough to add more, it would probably be best to loop |
| * through a const array of feature names. |
| */ |
| if (stats->loglevel > 0 && (stats->logmask & LOG_TOP) != 0) { |
| if (proc_has_feature(FEATURE_XD_Bit)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has XD Bit\n"); |
| if (proc_has_feature(FEATURE_MMX)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has MMX\n"); |
| if (proc_has_feature(FEATURE_FXSR)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has fxsave/fxrstor\n"); |
| if (proc_has_feature(FEATURE_SSE)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has SSE\n"); |
| if (proc_has_feature(FEATURE_SSE2)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has SSE2\n"); |
| if (proc_has_feature(FEATURE_SSE3)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has SSE3\n"); |
| if (proc_has_feature(FEATURE_AVX)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has AVX\n"); |
| if (proc_has_feature(FEATURE_OSXSAVE)) |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor has OSXSAVE\n"); |
| } |
| # endif |
| /* PR 264138: for 32-bit CONTEXT we assume fxsave layout */ |
| CLIENT_ASSERT((proc_has_feature(FEATURE_FXSR) && proc_has_feature(FEATURE_SSE)) || |
| (!proc_has_feature(FEATURE_FXSR) && !proc_has_feature(FEATURE_SSE)), |
| "Unsupported processor type: SSE and FXSR must match"); |
| |
| if (proc_has_feature(FEATURE_AVX) && proc_has_feature(FEATURE_OSXSAVE)) { |
| /* Even if the processor supports AVX, it will #UD on any AVX instruction |
| * if the OS hasn't enabled YMM and XMM state saving. |
| * To check that, we invoke xgetbv -- for which we need FEATURE_OSXSAVE. |
| * FEATURE_OSXSAVE is also listed as one of the 3 steps in Intel Vol 1 |
| * Fig 13-1: 1) cpuid OSXSAVE; 2) xgetbv 0x6; 3) cpuid AVX. |
| * Xref i#1278, i#1030, i#437. |
| */ |
| uint bv_high = 0, bv_low = 0; |
| dr_xgetbv(&bv_high, &bv_low); |
| LOG(GLOBAL, LOG_TOP, 2, "\txgetbv => 0x%08x%08x\n", bv_high, bv_low); |
| if (TESTALL(XCR0_AVX|XCR0_SSE, bv_low)) { |
| avx_enabled = true; |
| LOG(GLOBAL, LOG_TOP, 1, "\tProcessor and OS fully support AVX\n"); |
| } else { |
| LOG(GLOBAL, LOG_TOP, 1, "\tOS does NOT support AVX\n"); |
| } |
| } |
| #endif /* X86 */ |
| } |
| |
| uint |
| proc_get_vendor(void) |
| { |
| return vendor; |
| } |
| |
| DR_API |
| int |
| proc_set_vendor(uint new_vendor) |
| { |
| if (new_vendor == VENDOR_INTEL || |
| new_vendor == VENDOR_AMD) { |
| uint old_vendor = vendor; |
| SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT); |
| vendor = new_vendor; |
| SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT); |
| return old_vendor; |
| } else { |
| CLIENT_ASSERT(false, "invalid vendor"); |
| return -1; |
| } |
| } |
| |
| uint |
| proc_get_family(void) |
| { |
| return family; |
| } |
| |
| uint proc_get_type(void) |
| { |
| return type; |
| } |
| |
| /* FIXME: Add MODEL_ constants to proc.h?? */ |
| uint proc_get_model(void) |
| { |
| return model; |
| } |
| |
| uint proc_get_stepping(void) |
| { |
| return stepping; |
| } |
| |
| bool |
| proc_has_feature(feature_bit_t f) |
| { |
| uint bit, val = 0; |
| |
| /* Cast to int to avoid tautological comparison if feature_bit_t enum is |
| * unsigned. */ |
| if ((int)f >= 0 && f <= 31) { |
| val = features.flags_edx; |
| } else if (f >= 32 && f <= 63) { |
| val = features.flags_ecx; |
| } else if (f >= 64 && f <= 95) { |
| val = features.ext_flags_edx; |
| } else if (f >= 96 && f <= 127) { |
| val = features.ext_flags_ecx; |
| } else { |
| CLIENT_ASSERT(false, "proc_has_feature: invalid parameter"); |
| } |
| |
| bit = f % 32; |
| return TEST((1 << bit), val); |
| } |
| |
| features_t * |
| proc_get_all_feature_bits(void) |
| { |
| return &features; |
| } |
| |
| char * |
| proc_get_brand_string(void) |
| { |
| return (char *)brand_string; |
| } |
| |
| cache_size_t |
| proc_get_L1_icache_size(void) |
| { |
| return L1_icache_size; |
| } |
| |
| cache_size_t |
| proc_get_L1_dcache_size(void) |
| { |
| return L1_dcache_size; |
| } |
| |
| cache_size_t |
| proc_get_L2_cache_size(void) |
| { |
| return L2_cache_size; |
| } |
| |
| const char * |
| proc_get_cache_size_str(cache_size_t size) |
| { |
| static const char *strings[] = { |
| "8 KB", |
| "16 KB", |
| "32 KB", |
| "64 KB", |
| "128 KB", |
| "256 KB", |
| "512 KB", |
| "1 MB", |
| "2 MB", |
| "unknown" |
| }; |
| CLIENT_ASSERT(size <= CACHE_SIZE_UNKNOWN, "proc_get_cache_size_str: invalid size"); |
| return strings[size]; |
| } |
| |
| size_t |
| proc_get_cache_line_size(void) |
| { |
| return cache_line_size; |
| } |
| |
| /* check to see if addr is cache aligned */ |
| bool |
| proc_is_cache_aligned(void *addr) |
| { |
| return (((ptr_uint_t)addr & mask) == 0x0); |
| } |
| |
| /* Given an address or number of bytes sz, return a number >= sz that is divisible |
| by the cache line size. */ |
| ptr_uint_t |
| proc_bump_to_end_of_cache_line(ptr_uint_t sz) |
| { |
| if ((sz & mask) == 0x0) |
| return sz; /* sz already a multiple of the line size */ |
| |
| return ((sz + cache_line_size) & ~mask); |
| } |
| |
| /* yes same result as PAGE_START...FIXME: get rid of one of them? */ |
| void * |
| proc_get_containing_page(void *addr) |
| { |
| return (void *) (((ptr_uint_t)addr) & ~(PAGE_SIZE-1)); |
| } |
| |
| /* No synchronization routines necessary. The Pentium hardware |
| * guarantees that the i and d caches are consistent. */ |
| void |
| machine_cache_sync(void *pc_start, void *pc_end, bool flush_icache) |
| { |
| /* empty */ |
| } |
| |
| DR_API |
| /** |
| * Returns the size in bytes needed for a buffer for saving the floating point state. |
| */ |
| size_t |
| proc_fpstate_save_size(void) |
| { |
| CLIENT_ASSERT(opnd_size_in_bytes(OPSZ_512) == 512 && |
| opnd_size_in_bytes(OPSZ_108) == 108, |
| "internal sizing discrepancy"); |
| return (proc_has_feature(FEATURE_FXSR) ? 512 : 108); |
| } |
| |
| DR_API |
| /* Saves the floating point state into the 16-byte-aligned buffer buf, |
| * which must be 512 bytes for processors with the FXSR feature, and |
| * 108 bytes for those without (where this routine does not support |
| * 16-bit operand sizing). |
| * |
| * DynamoRIO does NOT save the application's floating-point, MMX, or SSE state |
| * on context switches! Thus if a client performs any floating-point operations |
| * in its main routines called by DynamoRIO, the client must save and restore |
| * the floating-point/MMX/SSE state. |
| * If the client needs to do so inside the code cache the client should implement |
| * that itself. |
| * return number of bytes written |
| * |
| * XXX: we do not translate the last fp pc (xref i#698). If a client ever needs that |
| * we can try to support it in the future. |
| */ |
| size_t |
| proc_save_fpstate(byte *buf) |
| { |
| /* MUST be 16-byte aligned */ |
| CLIENT_ASSERT((((ptr_uint_t)buf) & 0x0000000f) == 0, |
| "proc_save_fpstate: buf must be 16-byte aligned"); |
| #ifdef X86 |
| if (proc_has_feature(FEATURE_FXSR)) { |
| /* Not using inline asm for identical cross-platform code |
| * here. An extra function call won't hurt here. |
| */ |
| # ifdef X64 |
| if (X64_MODE_DC(get_thread_private_dcontext())) |
| dr_fxsave(buf); |
| else |
| dr_fxsave32(buf); |
| # else |
| dr_fxsave(buf); |
| # endif |
| } else { |
| # ifdef WINDOWS |
| dr_fnsave(buf); |
| # else |
| asm volatile("fnsave %0 ; fwait" : "=m" ((*buf))); |
| # endif |
| } |
| return proc_fpstate_save_size(); |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| return 0; |
| #endif /* X86/ARM */ |
| } |
| |
| DR_API |
| /* Restores the floating point state from the 16-byte-aligned buffer |
| * buf, which must be 512 bytes for processors with the FXSR feature, |
| * and 108 bytes for those without (where this routine does not |
| * support 16-bit operand sizing). |
| */ |
| void |
| proc_restore_fpstate(byte *buf) |
| { |
| /* MUST be 16-byte aligned */ |
| CLIENT_ASSERT((((ptr_uint_t)buf) & 0x0000000f) == 0, |
| "proc_restore_fpstate: buf must be 16-byte aligned"); |
| #ifdef X86 |
| if (proc_has_feature(FEATURE_FXSR)) { |
| /* Not using inline asm for identical cross-platform code |
| * here. An extra function call won't hurt here. |
| */ |
| # ifdef X64 |
| if (X64_MODE_DC(get_thread_private_dcontext())) |
| dr_fxrstor(buf); |
| else |
| dr_fxrstor32(buf); |
| # else |
| dr_fxrstor(buf); |
| # endif |
| } else { |
| # ifdef WINDOWS |
| dr_frstor(buf); |
| # else |
| asm volatile("frstor %0" : : "m" ((*buf))); |
| # endif |
| } |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* X86/ARM */ |
| } |
| |
| /* XXX: we do not translate the last fp pc (xref i#698). If a client ever needs that |
| * we can try to support it in the future. |
| */ |
| void |
| dr_insert_save_fpstate(void *drcontext, instrlist_t *ilist, instr_t *where, |
| opnd_t buf) |
| { |
| #ifdef X86 |
| dcontext_t *dcontext = (dcontext_t *) drcontext; |
| if (proc_has_feature(FEATURE_FXSR)) { |
| /* we want "fxsave, fnclex, finit" */ |
| CLIENT_ASSERT(opnd_get_size(buf) == OPSZ_512, |
| "dr_insert_save_fpstate: opnd size must be OPSZ_512"); |
| if (X64_MODE_DC(dcontext)) |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fxsave64(dcontext, buf)); |
| else |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fxsave32(dcontext, buf)); |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fnclex(dcontext)); |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fwait(dcontext)); |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fninit(dcontext)); |
| } else { |
| /* auto-adjust opnd size so it will encode */ |
| if (opnd_get_size(buf) == OPSZ_512) |
| opnd_set_size(&buf, OPSZ_108); |
| /* FIXME: why is this appending fwait, vs "fsave" which prepends? */ |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fnsave(dcontext, buf)); |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fwait(dcontext)); |
| } |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* X86/ARM */ |
| } |
| |
| void |
| dr_insert_restore_fpstate(void *drcontext, instrlist_t *ilist, instr_t *where, |
| opnd_t buf) |
| { |
| #ifdef X86 |
| dcontext_t *dcontext = (dcontext_t *) drcontext; |
| if (proc_has_feature(FEATURE_FXSR)) { |
| CLIENT_ASSERT(opnd_get_size(buf) == OPSZ_512, |
| "dr_insert_save_fpstate: opnd size must be OPSZ_512"); |
| if (X64_MODE_DC(dcontext)) |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fxrstor64(dcontext, buf)); |
| else |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_fxrstor32(dcontext, buf)); |
| } else { |
| /* auto-adjust opnd size so it will encode */ |
| if (opnd_get_size(buf) == OPSZ_512) |
| opnd_set_size(&buf, OPSZ_108); |
| instrlist_meta_preinsert(ilist, where, INSTR_CREATE_frstor(dcontext, buf)); |
| } |
| #elif defined(ARM) |
| /* FIXME i#1551: NYI on ARM */ |
| ASSERT_NOT_IMPLEMENTED(false); |
| #endif /* X86/ARM */ |
| } |
| |
| bool |
| proc_avx_enabled(void) |
| { |
| return avx_enabled; |
| } |