| /* |
| * Test from attachement to this message: |
| * http://old.nabble.com/-PATCH-0-1--ARM%3A-NEON-optimized-implementation-of-memcpy.-td24328820.html |
| * |
| * Originally, this code compared memcpy_neon to memcpy_arm. Those parts have |
| * been ifdefed out and the test engine has been kept. |
| */ |
| #include <sys/time.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <time.h> |
| #include <stdlib.h> |
| |
| #include <eprintf.h> |
| #include <puny.h> |
| |
| #if 0 |
| void *memcpy_neon(void *, const void *, size_t); |
| void *memcpy_arm(void *, const void *, size_t); |
| #endif |
| |
| #define CORRECTNESS_TESTS_COUNT 300000 |
| #define CORRECTNESS_TEST_BUFFER_SIZE 16384 |
| |
| enum { FIT_L1_CACHE = 4096, /* Small enough to fit in L1 Cache */ |
| FIT_L2_CACHE = 64 * 1024, /* Bigger than L1 but fit in L2 */ |
| FIT_SDRAM = 8 * 1024 * 1024, /* Bigger than biggest cache */ |
| BUFFER_SIZE = 2 * FIT_SDRAM}; |
| |
| #define FIT_CHECK(_x) ((_x) + (_x) / 2) /* Run test again but 50% |
| * bigger to make sure numbers |
| * seem right. |
| */ |
| |
| uint8_t *testbuffer8_1w; |
| uint8_t *testbuffer8_1r; |
| uint8_t *testbuffer8_2w; |
| uint8_t *testbuffer8_2r; |
| |
| struct { |
| double scale; |
| char *units; |
| char *legend; |
| } meg = { 1024.0 * 1024.0, "MiB", "2**20 or 1,048,576 bytes" }; |
| |
| uint8_t run_correctness = TRUE; |
| uint8_t run_trivial = TRUE; |
| |
| void *memcpy_trivial(void *d, const void *s, size_t count) |
| { |
| uint8_t *dst = d; |
| const uint8_t *src = s; |
| while (count-- > 0) *dst++ = *src++; |
| return d; |
| } |
| |
| int run_correctness_test(void) |
| { |
| int i; |
| uint8_t c8; |
| int offs1, offs2, size; |
| printf("--- Running correctness tests (use '-b' option to skip) ---\n"); |
| for (i = 0; i < CORRECTNESS_TEST_BUFFER_SIZE; i++) { |
| c8 = rand(); |
| testbuffer8_1r[i] = c8; |
| testbuffer8_2r[i] = c8; |
| testbuffer8_1w[i] = c8; |
| testbuffer8_2w[i] = c8; |
| } |
| srand(0); |
| for (i = 0; i < CORRECTNESS_TESTS_COUNT; i++) { |
| offs1 = rand() % (BUFFER_SIZE / 2); |
| offs2 = rand() % (BUFFER_SIZE / 2); |
| size = (rand() % 2) ? (rand() % (CORRECTNESS_TEST_BUFFER_SIZE / 2)) |
| : (rand() % 64); |
| |
| if (run_trivial) |
| memcpy_trivial(testbuffer8_1w + offs1, |
| testbuffer8_1r + offs2, size); |
| #if 0 |
| memcpy_neon(testbuffer8_2w + offs1, testbuffer8_2r + offs2, size); |
| #endif |
| memcpy(testbuffer8_2w + offs1, testbuffer8_2r + offs2, size); |
| if (memcmp(testbuffer8_1w, testbuffer8_2w, |
| CORRECTNESS_TEST_BUFFER_SIZE) != 0) { |
| printf("memcpy_trivial: test failed," |
| " i=%d, offs1=%d offs2=%d, size=%d\n", |
| i, offs1, offs2, size); |
| exit(1); |
| } |
| } |
| printf("all the correctness tests passed\n\n"); |
| return 0; |
| } |
| |
| static int64_t gettime(void) |
| { |
| struct timeval tv; |
| gettimeofday(&tv, NULL); |
| return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec; |
| } |
| |
| #define OFFS1 64 |
| #define OFFS2 64 |
| |
| void run_bench(const char *msg, |
| uint8_t *dst, uint8_t *src, int size, |
| void *(*f)(void *, const void *, size_t)) { |
| int64_t before1, after1, before2, after2; |
| int i, j, k, kmax; |
| |
| kmax = 1024 * 1024 / size; |
| if (kmax == 0) kmax = 1; |
| if (kmax > 256) kmax = 256; |
| |
| /* Note: we do copy in both directions on purpose. The point |
| * is that ARM does not support write-allocate for L1 cache. |
| * During the test, destination buffer may get sometimes |
| * evicted from cache (if some other process gets activated |
| * for a short period of time and suddenly triggers load of |
| * lots of data into cache, evicting our data). If this |
| * happens, writes into a destination buffer would always |
| * miss cache, hugely impacting performance. As a result, |
| * benchmark numbers would vary a lot in a nonreproducible |
| * way. Reading from both source and destination buffers |
| * during test ensures that the data would be immediately |
| * reloaded into L1 cache |
| */ |
| f(dst, src, size + 64); |
| f(src, dst, size + 64); |
| |
| before1 = gettime(); |
| for (k = 0; k < kmax; k++) |
| for (i = 0; i < OFFS1; i++) |
| for (j = 0; j < OFFS2; j++) { |
| f(dst + i, src + j, size); |
| f(src + j, dst + i, size); |
| } |
| after1 = gettime(); |
| |
| before2 = gettime(); |
| for (k = 0; k < kmax; k++) |
| for (i = 0; i < OFFS1; i++) |
| for (j = 0; j < OFFS2; j++) { |
| f(dst, src, size); |
| f(src, dst, size); |
| } |
| after2 = gettime(); |
| |
| printf("%s (%d bytes copy) = %6.1f %s/s / %6.1f %s/s\n", msg, size, |
| (double)size * OFFS1 * OFFS2 * 1000000. * kmax * 2 / |
| (double)(after1 - before1) / meg.scale, |
| meg.units, |
| (double)size * OFFS1 * OFFS2 * 1000000. * kmax * 2 / |
| (double)(after2 - before2) / meg.scale, |
| meg.units); |
| } |
| |
| void run_bench_for_for_size(int size) |
| { |
| #if 0 |
| run_bench("memcpy_neon : ", testbuffer8_1w, testbuffer8_2w, |
| size, memcpy_neon); |
| run_bench("memcpy_arm : ", testbuffer8_1w, testbuffer8_2w, |
| size, memcpy_arm); |
| #endif |
| /* insert a call to benchmark your own implementation here */ |
| if (run_trivial) |
| run_bench("memcpy_trivial: ", testbuffer8_1w, testbuffer8_2w, |
| size, memcpy_trivial); |
| run_bench("memcpy : ", testbuffer8_1w, testbuffer8_2w, |
| size, memcpy); |
| } |
| |
| void run_performance_tests(void) |
| { |
| printf("--- Running benchmarks (average case/perfect alignment case) ---\n"); |
| |
| printf("\nvery small data test:\n"); |
| run_bench_for_for_size(3); |
| run_bench_for_for_size(4); |
| run_bench_for_for_size(5); |
| run_bench_for_for_size(7); |
| run_bench_for_for_size(8); |
| run_bench_for_for_size(11); |
| run_bench_for_for_size(12); |
| run_bench_for_for_size(15); |
| run_bench_for_for_size(16); |
| run_bench_for_for_size(24); |
| run_bench_for_for_size(31); |
| |
| printf("\nL1 cached data:\n"); |
| run_bench_for_for_size(FIT_L1_CACHE); |
| run_bench_for_for_size(FIT_CHECK(FIT_L1_CACHE)); |
| |
| printf("\nL2 cached data:\n"); |
| run_bench_for_for_size(FIT_L2_CACHE); |
| run_bench_for_for_size(FIT_CHECK(FIT_L2_CACHE)); |
| |
| printf("\nSDRAM:\n"); |
| run_bench_for_for_size(FIT_SDRAM); |
| run_bench_for_for_size(FIT_CHECK(FIT_SDRAM)); |
| |
| printf("\n(*) 1 %s = %s\n", meg.units, meg.legend); |
| #if 0 |
| printf("(*) 'memcpy_arm' - an implementation for" |
| " older ARM cores from glibc-ports\n"); |
| #endif |
| } |
| |
| bool myopt (int c) |
| { |
| switch (c) { |
| case 'b': |
| run_correctness = FALSE; |
| break; |
| case 'm': |
| meg.scale = 1000. * 1000.; |
| meg.legend = "1,000,000 bytes"; |
| meg.units = "MB"; |
| break; |
| case 't': |
| run_trivial = FALSE; |
| break; |
| default: |
| return FALSE; |
| } |
| return TRUE; |
| } |
| |
| void usage(void) |
| { |
| pr_usage("-bhmt\n" |
| "\tb - only run benchmark, don't include correctness test\n" |
| "\th - help\n" |
| "\tm - Use Meg = 1,000,000; default is 2**20 or 1,048,576\n" |
| "\tt - skip the trivial tests\n"); |
| } |
| |
| int main(int argc, char *argv[]) |
| { |
| uint8_t *p; |
| int rc = posix_memalign((void **)&p, 4096, BUFFER_SIZE * 4); |
| if (rc) { |
| fatal("posix_memalign %d", rc); |
| } |
| testbuffer8_1w = p + 0 * BUFFER_SIZE; |
| testbuffer8_1r = p + 1 * BUFFER_SIZE; |
| testbuffer8_2w = p + 2 * BUFFER_SIZE; |
| testbuffer8_2r = p + 3 * BUFFER_SIZE; |
| |
| punyopt(argc, argv, myopt, "bmt"); |
| if (run_correctness) |
| run_correctness_test(); |
| run_performance_tests(); |
| |
| free(p); |
| return 0; |
| } |