| #include <sys/time.h> | 
 | #include <stdio.h> | 
 | #include <stdlib.h> | 
 | #include <unistd.h> | 
 | #include <iostream> | 
 | #include <vector> | 
 | #include <numeric> | 
 |  | 
 | /// This test contains some of the loops from the GCC vectorizer example page [1]. | 
 | /// Dorit Nuzman who developed the gcc vectorizer said that we can use them in our test suite. | 
 | /// | 
 | /// [1] - http://gcc.gnu.org/projects/tree-ssa/vectorization.html | 
 |  | 
 | #define N 1024 | 
 | #define M 32 | 
 | #define K 4 | 
 | #define ALIGNED16 __attribute__((aligned(16))) | 
 |  | 
 | unsigned short usa[N]; | 
 | short sa[N]; | 
 | short sb[N]; | 
 | short sc[N]; | 
 | unsigned int   ua[N]; | 
 | int   ia[N] ALIGNED16; | 
 | int   ib[N] ALIGNED16; | 
 | int   ic[N] ALIGNED16; | 
 | unsigned int ub[N]; | 
 | unsigned int uc[N]; | 
 | float fa[N], fb[N]; | 
 | float da[N], db[N], dc[N], dd[N]; | 
 | int dj[N]; | 
 |  | 
 | struct A { | 
 |   int ca[N]; | 
 | } s; | 
 |  | 
 | int a[N*2] ALIGNED16; | 
 | int b[N*2] ALIGNED16; | 
 | int c[N*2] ALIGNED16; | 
 | int d[N*2] ALIGNED16; | 
 |  | 
 | __attribute__((noinline)) | 
 | void example1 () { | 
 |   int i; | 
 |  | 
 |   for (i=0; i<256; i++){ | 
 |     a[i] = b[i] + c[i]; | 
 |   } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example2a (int n, int x) { | 
 |    int i; | 
 |  | 
 |    /* feature: support for unknown loop bound  */ | 
 |    /* feature: support for loop invariants  */ | 
 |    for (i=0; i<n; i++) { | 
 |       b[i] = x; | 
 |    } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example2b (int n, int x) { | 
 |   int i = 0; | 
 |    /* feature: general loop exit condition  */ | 
 |    /* feature: support for bitwise operations  */ | 
 |    while (n--){ | 
 |       a[i] = b[i]&c[i]; i++; | 
 |    } | 
 | } | 
 |  | 
 |  | 
 | typedef int aint __attribute__ ((__aligned__(16))); | 
 | __attribute__((noinline)) | 
 | void example3 (int n, aint * __restrict__ p, aint * __restrict q) { | 
 |  | 
 |    /* feature: support for (aligned) pointer accesses.  */ | 
 |    while (n--){ | 
 |       *p++ = *q++; | 
 |    } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example4a (int n, aint * __restrict__ p, aint * __restrict__ q) { | 
 |    /* feature: support for (aligned) pointer accesses  */ | 
 |    /* feature: support for constants  */ | 
 |    while (n--){ | 
 |       *p++ = *q++ + 5; | 
 |    } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example4b (int n, aint * __restrict__ p, aint * __restrict__ q) { | 
 |    int i; | 
 |  | 
 |    /* feature: support for read accesses with a compile time known misalignment  */ | 
 |    for (i=0; i<n; i++){ | 
 |       a[i] = b[i+1] + c[i+3]; | 
 |    } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example4c (int n, aint * __restrict__ p, aint * __restrict__ q) { | 
 |    int i; | 
 |     const int MAX = 4; | 
 |    /* feature: support for if-conversion  */ | 
 |    for (i=0; i<n; i++){ | 
 |       int j = a[i]; | 
 |       b[i] = (j > MAX ? MAX : 0); | 
 |    } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void  example5 (int n, struct A *s) { | 
 |   int i; | 
 |   for (i = 0; i < n; i++) { | 
 |     /* feature: support for alignable struct access  */ | 
 |     s->ca[i] = 5; | 
 |   } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void  example7 (int x) { | 
 |    int i; | 
 |  | 
 |    /* feature: support for read accesses with an unknown misalignment  */ | 
 |    for (i=0; i<N; i++){ | 
 |       a[i] = b[i+x]; | 
 |    } | 
 | } | 
 |  | 
 | int G[M][N]; | 
 | __attribute__((noinline)) | 
 | void example8 (int x) { | 
 |    int i,j; | 
 |  | 
 |    /* feature: support for multidimensional arrays  */ | 
 |    for (i=0; i<M; i++) { | 
 |      for (j=0; j<N; j++) { | 
 |        G[i][j] = x; | 
 |      } | 
 |    } | 
 | } | 
 |  | 
 |  | 
 | __attribute__((noinline)) | 
 | void example9 (unsigned *ret) { | 
 |   int i; | 
 |  | 
 |   /* feature: support summation reduction. | 
 |      note: in case of floats use -funsafe-math-optimizations  */ | 
 |   unsigned int diff = 0; | 
 |   for (i = 0; i < N; i++) { | 
 |     diff += (ub[i] - uc[i]); | 
 |   } | 
 |  | 
 |   *ret = diff; | 
 | } | 
 |  | 
 |  | 
 | /* feature: support data-types of different sizes. | 
 |    Currently only a single vector-size per target is supported;  | 
 |    it can accommodate n elements such that n = vector-size/element-size  | 
 |    (e.g, 4 ints, 8 shorts, or 16 chars for a vector of size 16 bytes).  | 
 |    A combination of data-types of different sizes in the same loop  | 
 |    requires special handling. This support is now present in mainline, | 
 |    and also includes support for type conversions.  */ | 
 | __attribute__((noinline)) | 
 | void example10a(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) { | 
 |   int i; | 
 |   for (i = 0; i < N; i++) { | 
 |     ia[i] = ib[i] + ic[i]; | 
 |     sa[i] = sb[i] + sc[i]; | 
 |   } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example10b(short *__restrict__ sa, short *__restrict__ sb, short *__restrict__ sc, int* __restrict__ ia, int* __restrict__ ib, int* __restrict__ ic) { | 
 |   int i; | 
 |   for (i = 0; i < N; i++) { | 
 |     ia[i] = (int) sb[i]; | 
 |   } | 
 | } | 
 |  | 
 | /* feature: support strided accesses - the data elements | 
 |    that are to be operated upon in parallel are not consecutive - they | 
 |    are accessed with a stride > 1 (in the example, the stride is 2):  */ | 
 | __attribute__((noinline)) | 
 | void example11() { | 
 |    int i; | 
 |   for (i = 0; i < N/2; i++){ | 
 |     a[i] = b[2*i+1] * c[2*i+1] - b[2*i] * c[2*i]; | 
 |     d[i] = b[2*i] * c[2*i+1] + b[2*i+1] * c[2*i]; | 
 |   } | 
 | } | 
 |  | 
 |  | 
 | __attribute__((noinline)) | 
 | void example12() { | 
 |   for (int i = 0; i < N; i++) { | 
 |     a[i] = i; | 
 |   } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example13(int **A, int **B, int *out) { | 
 |   int i,j; | 
 |   for (i = 0; i < M; i++) { | 
 |     int diff = 0; | 
 |     for (j = 0; j < N; j+=8) { | 
 |       diff += (A[i][j] - B[i][j]); | 
 |     } | 
 |     out[i] = diff; | 
 |   } | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example14(int **in, int **coeff, int *out) { | 
 |   int k,j,i=0; | 
 |   for (k = 0; k < K; k++) { | 
 |     int sum = 0; | 
 |     for (j = 0; j < M; j++) | 
 |       for (i = 0; i < N; i++) | 
 |           sum += in[i+k][j] * coeff[i][j]; | 
 |  | 
 |     out[k] = sum; | 
 |   } | 
 |  | 
 | } | 
 |  | 
 |  | 
 | __attribute__((noinline)) | 
 | void example21(int *b, int n) { | 
 |   int i, a = 0; | 
 |  | 
 |   for (i = n-1; i >= 0; i--) | 
 |     a += b[i]; | 
 |  | 
 |   b[0] = a; | 
 | } | 
 |  | 
 | __attribute__((noinline)) | 
 | void example23 (unsigned short *src, unsigned int *dst) | 
 | { | 
 |   int i; | 
 |  | 
 |   for (i = 0; i < 256; i++) | 
 |     *dst++ = *src++ << 7; | 
 | } | 
 |  | 
 |  | 
 | __attribute__((noinline)) | 
 | void example24 (short x, short y) | 
 | { | 
 |   int i; | 
 |   for (i = 0; i < N; i++) | 
 |     ic[i] = fa[i] < fb[i] ? x : y; | 
 | } | 
 |  | 
 |  | 
 | __attribute__((noinline)) | 
 | void example25 (void) | 
 | { | 
 |   int i; | 
 |   char x, y; | 
 |   for (i = 0; i < N; i++) | 
 |     { | 
 |       x = (da[i] < db[i]); | 
 |       y = (dc[i] < dd[i]); | 
 |       dj[i] = x & y; | 
 |     } | 
 | } | 
 |  | 
 | void init_memory(void *start, void* end) { | 
 |   unsigned char state = 1; | 
 |   while (start != end) { | 
 |     state *= 7; state ^= 0x27; state += 1; | 
 |     *((unsigned char*)start) = state; | 
 |     start = ((char*)start) + 1; | 
 |   } | 
 | } | 
 |  | 
 | void init_memory_float(float *start, float* end) { | 
 |   float state = 1.0; | 
 |   while (start != end) { | 
 |     state *= 1.1; | 
 |     *start = state; | 
 |     start++; | 
 |   } | 
 | } | 
 |  | 
 | unsigned digest_memory(void *start, void* end) { | 
 |   unsigned state = 1; | 
 |   while (start != end) { | 
 |     state *= 3; | 
 |     state ^= *((unsigned char*)start); | 
 |     state = (state >> 8  ^ state << 8); | 
 |     start = ((char*)start) + 1; | 
 |   } | 
 |   return state; | 
 | } | 
 |  | 
 | class Timer { | 
 |  | 
 | public: | 
 |   Timer(const char* title, bool print) { | 
 |     Title = title; | 
 |     Print = print; | 
 |     gettimeofday(&Start, 0); | 
 |   } | 
 |  | 
 |   ~Timer() { | 
 |     gettimeofday(&End, 0); | 
 |     long mtime, s,us; | 
 |     s = End.tv_sec  - Start.tv_sec; | 
 |     us = End.tv_usec - Start.tv_usec; | 
 |     mtime = (s*1000 + us/1000.0)+0.5; | 
 |     if (Print) | 
 |       std::cout<<Title<<", "<<mtime<<", msec\n"; | 
 |   } | 
 |  | 
 | private: | 
 |   const char* Title; | 
 |   bool Print; | 
 |   struct timeval Start, End; | 
 | }; | 
 |  | 
 |  | 
 | // Warmup and then measure. | 
 | #define BENCH(NAME, RUN_LINE, ITER, DIGEST_LINE) {\ | 
 |   RUN_LINE;\ | 
 |   Timer atimer(NAME, print_times);\ | 
 |   for (int i=0; i < (ITER); ++i) RUN_LINE;\ | 
 |   unsigned r = DIGEST_LINE;\ | 
 |   results.push_back(r);\ | 
 |  } | 
 |  | 
 | int main(int argc,char* argv[]){ | 
 |  | 
 |   bool print_times = argc > 1; | 
 |  | 
 |   std::vector<unsigned> results; | 
 |   unsigned dummy = 0; | 
 | #ifdef SMALL_PROBLEM_SIZE | 
 |   const int Mi = 1<<10; | 
 | #else | 
 |   const int Mi = 1<<18; | 
 | #endif | 
 |   init_memory(&ia[0], &ia[N]); | 
 |   init_memory(&ib[0], &ib[N]); | 
 |   init_memory(&ic[0], &ic[N]); | 
 |   init_memory(&sa[0], &sa[N]); | 
 |   init_memory(&sb[0], &sb[N]); | 
 |   init_memory(&sc[0], &sc[N]); | 
 |   init_memory(&a[0], &a[N*2]); | 
 |   init_memory(&b[0], &b[N*2]); | 
 |   init_memory(&c[0], &c[N*2]); | 
 |   init_memory(&ua[0], &ua[N]); | 
 |   init_memory(&ub[0], &ub[N]); | 
 |   init_memory(&uc[0], &uc[N]); | 
 |   init_memory(&G[0][0], &G[0][N]); | 
 |   init_memory_float(&fa[0], &fa[N]); | 
 |   init_memory_float(&fb[0], &fb[N]); | 
 |   init_memory_float(&da[0], &da[N]); | 
 |   init_memory_float(&db[0], &db[N]); | 
 |   init_memory_float(&dc[0], &dc[N]); | 
 |   init_memory_float(&dd[0], &dd[N]); | 
 |  | 
 |   BENCH("Example1",   example1(), Mi*10, digest_memory(&a[0], &a[256])); | 
 |   BENCH("Example2a",  example2a(N, 2), Mi*4, digest_memory(&b[0], &b[N])); | 
 |   BENCH("Example2b",  example2b(N, 2), Mi*2, digest_memory(&a[0], &a[N])); | 
 |   BENCH("Example3",   example3(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N])); | 
 |   BENCH("Example4a",  example4a(N, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N])); | 
 |   BENCH("Example4b",  example4b(N-10, ia, ib), Mi*2, digest_memory(&ia[0], &ia[N])); | 
 |   BENCH("Example4c",  example4c(N, ia, ib), Mi*2, digest_memory(&ib[0], &ib[N])); | 
 |   BENCH("Example7",   example7(4), Mi*4, digest_memory(&a[0], &a[N])); | 
 |   BENCH("Example8",   example8(8), Mi/4, digest_memory(&G[0][0], &G[0][N])); | 
 |   BENCH("Example9",   example9(&dummy), Mi*2, dummy); | 
 |   BENCH("Example10a", example10a(sa,sb,sc,ia,ib,ic), Mi*2, digest_memory(&ia[0], &ia[N]) + digest_memory(&sa[0], &sa[N])); | 
 |   BENCH("Example10b", example10b(sa,sb,sc,ia,ib,ic), Mi*4, digest_memory(&ia[0], &ia[N])); | 
 |   BENCH("Example11",  example11(), Mi*2, digest_memory(&d[0], &d[N])); | 
 |   BENCH("Example12",  example12(), Mi*4, digest_memory(&a[0], &a[N])); | 
 |   //BENCH("Example21",  example21(ia, N), Mi*4, digest_memory(&ia[0], &ia[N])); | 
 |   BENCH("Example23",  example23(usa,ua), Mi*8, digest_memory(&usa[0], &usa[256])); | 
 |   BENCH("Example24",  example24(2,4), Mi*2, 0); | 
 |   BENCH("Example25",  example25(), Mi*2, digest_memory(&dj[0], &dj[N])); | 
 |  | 
 |   std::cout<<std::hex; | 
 |   std::cout<<"Results: ("<<std::accumulate(results.begin(), results.end(), 0)<<"):"; | 
 |   for (unsigned i=0; i < results.size(); ++i) { | 
 |     std::cout<<" "<<results[i]; | 
 |   } | 
 |   std::cout<<"\n"; | 
 |  | 
 |   return 0; | 
 | } | 
 |  | 
 |  | 
 |  |