| /* |
| * Copyright 2013 The Emscripten Authors. All rights reserved. |
| * Emscripten is available under two separate licenses, the MIT license and the |
| * University of Illinois/NCSA Open Source License. Both these licenses can be |
| * found in the LICENSE file. |
| */ |
| |
| /* gcc linpack.c cpuidc64.o cpuida64.o -m64 -lrt -lc -lm -o linpack |
| * |
| * Linpack 100x100 Benchmark In C/C++ For PCs |
| * |
| * Different compilers can produce different floating point numeric |
| * results, probably due to compiling instructions in a different |
| * sequence. As the program checks these, they may need to be changed. |
| * The log file indicates non-standard results and these values can |
| * be copied and pasted into this program. See // Values near the |
| * end of main(). |
| * |
| * Different compilers do not optimise the code in the same way. |
| * This can lead to wide variations in benchmark speeds. See results |
| * with MS6 compiler ID and compare with those from same CPUs from |
| * the Watcom compiler generated code. |
| * |
| *************************************************************************** |
| */ |
| |
| #define _CRT_SECURE_NO_WARNINGS 1 |
| #ifdef WIN32 |
| #include <Windows.h> |
| #else |
| #include <sys/time.h> |
| #endif |
| |
| #define UNROLL |
| #ifndef SP |
| #define DP |
| #endif |
| |
| #ifdef SP |
| #define REAL float |
| #define ZERO 0.0 |
| #define ONE 1.0 |
| #define PREC "Single" |
| #endif |
| |
| #ifdef DP |
| #define REAL double |
| #define ZERO 0.0e0 |
| #define ONE 1.0e0 |
| #define PREC "Double" |
| #endif |
| |
| #ifdef ROLL |
| #define ROLLING "Rolled" |
| #endif |
| #ifdef UNROLL |
| #define ROLLING "Unrolled" |
| #endif |
| |
| // VERSION |
| |
| #ifdef CNNT |
| #define options "Non-optimised" |
| #define opt "0" |
| #else |
| // #define options "Optimised" |
| #define options "Opt 3 64 Bit" |
| #define opt "1" |
| #endif |
| |
| #define NTIMES 10 |
| |
| #include <stdio.h> |
| #include <math.h> |
| #include <stdlib.h> |
| #include <time.h> |
| |
| |
| /* this is truly rank, but it's minimally invasive, and lifted in part from the STREAM scores */ |
| |
| static double secs; |
| |
| #ifndef WIN32 |
| |
| double mysecond() |
| { |
| struct timeval tp; |
| struct timezone tzp; |
| int i; |
| |
| i = gettimeofday(&tp,&tzp); |
| return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); |
| } |
| #else |
| |
| double mysecond() |
| { |
| static LARGE_INTEGER freq = {0}; |
| LARGE_INTEGER count = {0}; |
| if(freq.QuadPart == 0LL) { |
| QueryPerformanceFrequency(&freq); |
| } |
| QueryPerformanceCounter(&count); |
| return (double)count.QuadPart / (double)freq.QuadPart; |
| } |
| |
| #endif |
| |
| void start_time() |
| { |
| secs = mysecond(); |
| } |
| |
| void end_time() |
| { |
| secs = mysecond() - secs; |
| } |
| |
| void print_time (int row); |
| void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma); |
| void dgefa (REAL a[], int lda, int n, int ipvt[], int *info); |
| void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job); |
| void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); |
| void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy); |
| REAL epslon (REAL x); |
| int idamax (int n, REAL dx[], int incx); |
| void dscal (int n, REAL da, REAL dx[], int incx); |
| REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy); |
| |
| static REAL atime[9][15]; |
| double runSecs = 1; |
| |
| |
| int main (int argc, char *argv[]) |
| { |
| static REAL aa[200*200],a[200*201],b[200],x[200]; |
| REAL cray,ops,total,norma,normx; |
| REAL resid,residn,eps,tm2,epsn,x1,x2; |
| REAL mflops; |
| static int ipvt[200],n,i,j,ntimes,info,lda,ldaa; |
| int endit, pass, loop; |
| REAL overhead1, overhead2, time2; |
| REAL max1, max2; |
| char was[5][20]; |
| char expect[5][20]; |
| char title[5][20]; |
| int errors; |
| |
| int arg = argc > 1 ? argv[1][0] - '0' : 3; |
| if (arg == 0) return 0; |
| |
| printf("\n"); |
| |
| printf("##########################################\n"); |
| |
| |
| |
| lda = 201; |
| ldaa = 200; |
| cray = .056; |
| n = 100; |
| |
| fprintf(stdout, "%s ", ROLLING); |
| fprintf(stdout, "%s ", PREC); |
| fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n"); |
| |
| fprintf(stdout,"Optimisation %s\n\n",options); |
| |
| ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n); |
| |
| matgen(a,lda,n,b,&norma); |
| start_time(); |
| dgefa(a,lda,n,ipvt,&info); |
| end_time(); |
| atime[0][0] = secs; |
| start_time(); |
| dgesl(a,lda,n,ipvt,b,0); |
| end_time(); |
| atime[1][0] = secs; |
| total = atime[0][0] + atime[1][0]; |
| |
| /* compute a residual to verify results. */ |
| |
| for (i = 0; i < n; i++) { |
| x[i] = b[i]; |
| } |
| matgen(a,lda,n,b,&norma); |
| for (i = 0; i < n; i++) { |
| b[i] = -b[i]; |
| } |
| dmxpy(n,b,n,lda,x,a); |
| resid = 0.0; |
| normx = 0.0; |
| for (i = 0; i < n; i++) { |
| resid = (resid > fabs((double)b[i])) |
| ? resid : fabs((double)b[i]); |
| normx = (normx > fabs((double)x[i])) |
| ? normx : fabs((double)x[i]); |
| } |
| eps = epslon(ONE); |
| residn = resid/( n*norma*normx*eps ); |
| epsn = eps; |
| x1 = x[0] - 1; |
| x2 = x[n-1] - 1; |
| |
| printf("norm resid resid machep"); |
| printf(" x[0]-1 x[n-1]-1\n"); |
| printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n", |
| (double)residn, (double)resid, (double)epsn, |
| (double)x1, (double)x2); |
| |
| printf("Times are reported for matrices of order %5d\n",n); |
| printf("1 pass times for array with leading dimension of%5d\n\n",lda); |
| printf(" dgefa dgesl total Mflops unit"); |
| printf(" ratio\n"); |
| |
| atime[2][0] = total; |
| if (total > 0.0) |
| { |
| atime[3][0] = ops/(1.0e6*total); |
| atime[4][0] = 2.0/atime[3][0]; |
| } |
| else |
| { |
| atime[3][0] = 0.0; |
| atime[4][0] = 0.0; |
| } |
| atime[5][0] = total/cray; |
| |
| print_time(0); |
| |
| /************************************************************************ |
| * Calculate overhead of executing matgen procedure * |
| ************************************************************************/ |
| |
| printf("\nCalculating matgen overhead\n"); |
| pass = -20; |
| loop = NTIMES; |
| do |
| { |
| start_time(); |
| pass = pass + 1; |
| for ( i = 0 ; i < loop ; i++) |
| { |
| matgen(a,lda,n,b,&norma); |
| } |
| end_time(); |
| overhead1 = secs; |
| printf("%10d times %6.2f seconds\n", loop, overhead1); |
| if (overhead1 > runSecs) |
| { |
| pass = 0; |
| } |
| if (pass < 0) |
| { |
| if (overhead1 < 0.1) |
| { |
| loop = loop * 10; |
| } |
| else |
| { |
| loop = loop * 2; |
| } |
| } |
| } |
| while (pass < 0); |
| |
| overhead1 = overhead1 / (double)loop; |
| |
| printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead1); |
| |
| /************************************************************************ |
| * Calculate matgen/dgefa passes for runSecs seconds * |
| ************************************************************************/ |
| |
| printf("Calculating matgen/dgefa passes for %d seconds\n", (int)runSecs); |
| pass = -20; |
| ntimes = NTIMES; |
| do |
| { |
| start_time(); |
| pass = pass + 1; |
| for ( i = 0 ; i < ntimes ; i++) |
| { |
| matgen(a,lda,n,b,&norma); |
| dgefa(a,lda,n,ipvt,&info ); |
| } |
| end_time(); |
| time2 = secs; |
| printf("%10d times %6.2f seconds\n", ntimes, time2); |
| if (time2 > runSecs) |
| { |
| pass = 0; |
| } |
| if (pass < 0) |
| { |
| if (time2 < 0.1) |
| { |
| ntimes = ntimes * 10; |
| } |
| else |
| { |
| ntimes = ntimes * 2; |
| } |
| } |
| } |
| while (pass < 0); |
| |
| ntimes = (int)(runSecs * (double)ntimes / time2); |
| if (ntimes == 0) ntimes = 1; |
| |
| printf("Passes used %10d \n\n", ntimes); |
| printf("Times for array with leading dimension of%4d\n\n",lda); |
| printf(" dgefa dgesl total Mflops unit"); |
| printf(" ratio\n"); |
| |
| /************************************************************************ |
| * Execute 5 passes * |
| ************************************************************************/ |
| |
| tm2 = ntimes * overhead1; |
| atime[3][6] = 0; |
| |
| for (j=1 ; j<6 ; j++) |
| { |
| start_time(); |
| for (i = 0; i < ntimes; i++) |
| { |
| matgen(a,lda,n,b,&norma); |
| dgefa(a,lda,n,ipvt,&info ); |
| } |
| end_time(); |
| atime[0][j] = (secs - tm2)/ntimes; |
| |
| start_time(); |
| for (i = 0; i < ntimes; i++) |
| { |
| dgesl(a,lda,n,ipvt,b,0); |
| } |
| end_time(); |
| |
| atime[1][j] = secs/ntimes; |
| total = atime[0][j] + atime[1][j]; |
| atime[2][j] = total; |
| atime[3][j] = ops/(1.0e6*total); |
| atime[4][j] = 2.0/atime[3][j]; |
| atime[5][j] = total/cray; |
| atime[3][6] = atime[3][6] + atime[3][j]; |
| |
| print_time(j); |
| } |
| atime[3][6] = atime[3][6] / 5.0; |
| printf("Average %11.2f\n", |
| (double)atime[3][6]); |
| |
| printf("\nCalculating matgen2 overhead\n"); |
| |
| /************************************************************************ |
| * Calculate overhead of executing matgen procedure * |
| ************************************************************************/ |
| |
| start_time(); |
| for ( i = 0 ; i < loop ; i++) |
| { |
| matgen(aa,ldaa,n,b,&norma); |
| } |
| end_time(); |
| overhead2 = secs; |
| overhead2 = overhead2 / (double)loop; |
| |
| printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead2); |
| printf("Times for array with leading dimension of%4d\n\n",ldaa); |
| printf(" dgefa dgesl total Mflops unit"); |
| printf(" ratio\n"); |
| |
| /************************************************************************ |
| * Execute 5 passes * |
| ************************************************************************/ |
| |
| tm2 = ntimes * overhead2; |
| atime[3][12] = 0; |
| |
| for (j=7 ; j<12 ; j++) |
| { |
| start_time(); |
| for (i = 0; i < ntimes; i++) |
| { |
| matgen(aa,ldaa,n,b,&norma); |
| dgefa(aa,ldaa,n,ipvt,&info ); |
| } |
| end_time(); |
| atime[0][j] = (secs - tm2)/ntimes; |
| |
| start_time(); |
| for (i = 0; i < ntimes; i++) |
| { |
| dgesl(aa,ldaa,n,ipvt,b,0); |
| } |
| end_time(); |
| atime[1][j] = secs/ntimes; |
| total = atime[0][j] + atime[1][j]; |
| atime[2][j] = total; |
| atime[3][j] = ops/(1.0e6*total); |
| atime[4][j] = 2.0/atime[3][j]; |
| atime[5][j] = total/cray; |
| atime[3][12] = atime[3][12] + atime[3][j]; |
| |
| print_time(j); |
| } |
| atime[3][12] = atime[3][12] / 5.0; |
| printf("Average %11.2f\n", |
| (double)atime[3][12]); |
| |
| /************************************************************************ |
| * Use minimum average as overall Mflops rating * |
| ************************************************************************/ |
| |
| mflops = atime[3][6]; |
| if (atime[3][12] < mflops) mflops = atime[3][12]; |
| |
| printf("\n"); |
| printf( "%s ", ROLLING); |
| printf( "%s ", PREC); |
| printf(" Precision %11.2f Mflops \n\n",mflops); |
| |
| |
| max1 = 0; |
| for (i=1 ; i<6 ; i++) |
| { |
| if (atime[3][i] > max1) max1 = atime[3][i]; |
| } |
| |
| max2 = 0; |
| for (i=7 ; i<12 ; i++) |
| { |
| if (atime[3][i] > max2) max2 = atime[3][i]; |
| } |
| if (max1 < max2) max2 = max1; |
| |
| sprintf(was[0], "%16.1f",(double)residn); |
| sprintf(was[1], "%16.8e",(double)resid); |
| sprintf(was[2], "%16.8e",(double)epsn); |
| sprintf(was[3], "%16.8e",(double)x1); |
| sprintf(was[4], "%16.8e",(double)x2); |
| |
| /* |
| // Values for Watcom |
| |
| sprintf(expect[0], " 0.4"); |
| sprintf(expect[1], " 7.41628980e-014"); |
| sprintf(expect[2], " 1.00000000e-015"); |
| sprintf(expect[3], "-1.49880108e-014"); |
| sprintf(expect[4], "-1.89848137e-014"); |
| // Values for Visual C++ |
| |
| sprintf(expect[0], " 1.7"); |
| sprintf(expect[1], " 7.41628980e-014"); |
| sprintf(expect[2], " 2.22044605e-016"); |
| sprintf(expect[3], "-1.49880108e-014"); |
| sprintf(expect[4], "-1.89848137e-014"); |
| |
| // Values for Ubuntu GCC 32 Bit |
| |
| sprintf(expect[0], " 1.9"); |
| sprintf(expect[1], " 8.39915160e-14"); |
| sprintf(expect[2], " 2.22044605e-16"); |
| sprintf(expect[3], " -6.22835117e-14"); |
| sprintf(expect[4], " -4.16333634e-14"); |
| */ |
| |
| // Values for Ubuntu GCC 32 Bit |
| |
| sprintf(expect[0], " 1.7"); |
| sprintf(expect[1], " 7.41628980e-14"); |
| sprintf(expect[2], " 2.22044605e-16"); |
| sprintf(expect[3], " -1.49880108e-14"); |
| sprintf(expect[4], " -1.89848137e-14"); |
| |
| sprintf(title[0], "norm. resid"); |
| sprintf(title[1], "resid "); |
| sprintf(title[2], "machep "); |
| sprintf(title[3], "x[0]-1 "); |
| sprintf(title[4], "x[n-1]-1 "); |
| |
| if (strtol(opt, NULL, 10) == 0) |
| { |
| sprintf(expect[2], " 8.88178420e-016"); |
| } |
| errors = 0; |
| |
| printf ("\n"); |
| } |
| |
| /*----------------------*/ |
| void print_time (int row) |
| |
| { |
| printf("%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row], |
| (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], |
| (double)atime[4][row], (double)atime[5][row]); |
| return; |
| } |
| |
| /*----------------------*/ |
| |
| void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma) |
| |
| |
| /* We would like to declare a[][lda], but c does not allow it. In this |
| function, references to a[i][j] are written a[lda*i+j]. */ |
| |
| { |
| int init, i, j; |
| |
| init = 1325; |
| *norma = 0.0; |
| for (j = 0; j < n; j++) { |
| for (i = 0; i < n; i++) { |
| init = 3125*init % 65536; |
| a[lda*j+i] = (init - 32768.0)/16384.0; |
| *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma; |
| |
| /* alternative for some compilers |
| if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]); |
| */ |
| } |
| } |
| for (i = 0; i < n; i++) { |
| b[i] = 0.0; |
| } |
| for (j = 0; j < n; j++) { |
| for (i = 0; i < n; i++) { |
| b[i] = b[i] + a[lda*j+i]; |
| } |
| } |
| return; |
| } |
| |
| /*----------------------*/ |
| void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) |
| |
| |
| /* We would like to declare a[][lda], but c does not allow it. In this |
| function, references to a[i][j] are written a[lda*i+j]. */ |
| /* |
| dgefa factors a double precision matrix by gaussian elimination. |
| |
| dgefa is usually called by dgeco, but it can be called |
| directly with a saving in time if rcond is not needed. |
| (time for dgeco) = (1 + 9/n)*(time for dgefa) . |
| |
| on entry |
| |
| a REAL precision[n][lda] |
| the matrix to be factored. |
| |
| lda integer |
| the leading dimension of the array a . |
| |
| n integer |
| the order of the matrix a . |
| |
| on return |
| |
| a an upper triangular matrix and the multipliers |
| which were used to obtain it. |
| the factorization can be written a = l*u where |
| l is a product of permutation and unit lower |
| triangular matrices and u is upper triangular. |
| |
| ipvt integer[n] |
| an integer vector of pivot indices. |
| |
| info integer |
| = 0 normal value. |
| = k if u[k][k] .eq. 0.0 . this is not an error |
| condition for this subroutine, but it does |
| indicate that dgesl or dgedi will divide by zero |
| if called. use rcond in dgeco for a reliable |
| indication of singularity. |
| |
| linpack. this version dated 08/14/78 . |
| cleve moler, university of new mexico, argonne national lab. |
| |
| functions |
| |
| blas daxpy,dscal,idamax |
| */ |
| |
| { |
| /* internal variables */ |
| |
| REAL t; |
| int j,k,kp1,l,nm1; |
| |
| |
| /* gaussian elimination with partial pivoting */ |
| |
| *info = 0; |
| nm1 = n - 1; |
| if (nm1 >= 0) { |
| for (k = 0; k < nm1; k++) { |
| kp1 = k + 1; |
| |
| /* find l = pivot index */ |
| |
| l = idamax(n-k,&a[lda*k+k],1) + k; |
| ipvt[k] = l; |
| |
| /* zero pivot implies this column already |
| triangularized */ |
| |
| if (a[lda*k+l] != ZERO) { |
| |
| /* interchange if necessary */ |
| |
| if (l != k) { |
| t = a[lda*k+l]; |
| a[lda*k+l] = a[lda*k+k]; |
| a[lda*k+k] = t; |
| } |
| |
| /* compute multipliers */ |
| |
| t = -ONE/a[lda*k+k]; |
| dscal(n-(k+1),t,&a[lda*k+k+1],1); |
| |
| /* row elimination with column indexing */ |
| |
| for (j = kp1; j < n; j++) { |
| t = a[lda*j+l]; |
| if (l != k) { |
| a[lda*j+l] = a[lda*j+k]; |
| a[lda*j+k] = t; |
| } |
| daxpy(n-(k+1),t,&a[lda*k+k+1],1, |
| &a[lda*j+k+1],1); |
| } |
| } |
| else { |
| *info = k; |
| } |
| } |
| } |
| ipvt[n-1] = n-1; |
| if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; |
| return; |
| } |
| |
| /*----------------------*/ |
| |
| void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) |
| |
| |
| /* We would like to declare a[][lda], but c does not allow it. In this |
| function, references to a[i][j] are written a[lda*i+j]. */ |
| |
| /* |
| dgesl solves the double precision system |
| a * x = b or trans(a) * x = b |
| using the factors computed by dgeco or dgefa. |
| |
| on entry |
| |
| a double precision[n][lda] |
| the output from dgeco or dgefa. |
| |
| lda integer |
| the leading dimension of the array a . |
| |
| n integer |
| the order of the matrix a . |
| |
| ipvt integer[n] |
| the pivot vector from dgeco or dgefa. |
| |
| b double precision[n] |
| the right hand side vector. |
| |
| job integer |
| = 0 to solve a*x = b , |
| = nonzero to solve trans(a)*x = b where |
| trans(a) is the transpose. |
| |
| on return |
| |
| b the solution vector x . |
| |
| error condition |
| |
| a division by zero will occur if the input factor contains a |
| zero on the diagonal. technically this indicates singularity |
| but it is often caused by improper arguments or improper |
| setting of lda . it will not occur if the subroutines are |
| called correctly and if dgeco has set rcond .gt. 0.0 |
| or dgefa has set info .eq. 0 . |
| |
| to compute inverse(a) * c where c is a matrix |
| with p columns |
| dgeco(a,lda,n,ipvt,rcond,z) |
| if (!rcond is too small){ |
| for (j=0,j<p,j++) |
| dgesl(a,lda,n,ipvt,c[j][0],0); |
| } |
| |
| linpack. this version dated 08/14/78 . |
| cleve moler, university of new mexico, argonne national lab. |
| |
| functions |
| |
| blas daxpy,ddot |
| */ |
| { |
| /* internal variables */ |
| |
| REAL t; |
| int k,kb,l,nm1; |
| |
| nm1 = n - 1; |
| if (job == 0) { |
| |
| /* job = 0 , solve a * x = b |
| first solve l*y = b */ |
| |
| if (nm1 >= 1) { |
| for (k = 0; k < nm1; k++) { |
| l = ipvt[k]; |
| t = b[l]; |
| if (l != k){ |
| b[l] = b[k]; |
| b[k] = t; |
| } |
| daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); |
| } |
| } |
| |
| /* now solve u*x = y */ |
| |
| for (kb = 0; kb < n; kb++) { |
| k = n - (kb + 1); |
| b[k] = b[k]/a[lda*k+k]; |
| t = -b[k]; |
| daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); |
| } |
| } |
| else { |
| |
| /* job = nonzero, solve trans(a) * x = b |
| first solve trans(u)*y = b */ |
| |
| for (k = 0; k < n; k++) { |
| t = ddot(k,&a[lda*k+0],1,&b[0],1); |
| b[k] = (b[k] - t)/a[lda*k+k]; |
| } |
| |
| /* now solve trans(l)*x = y */ |
| |
| if (nm1 >= 1) { |
| for (kb = 1; kb < nm1; kb++) { |
| k = n - (kb+1); |
| b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); |
| l = ipvt[k]; |
| if (l != k) { |
| t = b[l]; |
| b[l] = b[k]; |
| b[k] = t; |
| } |
| } |
| } |
| } |
| return; |
| } |
| |
| /*----------------------*/ |
| |
| void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) |
| /* |
| constant times a vector plus a vector. |
| jack dongarra, linpack, 3/11/78. |
| */ |
| |
| { |
| int i,ix,iy,m,mp1; |
| |
| mp1 = 0; |
| m = 0; |
| |
| if(n <= 0) return; |
| if (da == ZERO) return; |
| |
| if(incx != 1 || incy != 1) { |
| |
| /* code for unequal increments or equal increments |
| not equal to 1 */ |
| |
| ix = 0; |
| iy = 0; |
| if(incx < 0) ix = (-n+1)*incx; |
| if(incy < 0)iy = (-n+1)*incy; |
| for (i = 0;i < n; i++) { |
| dy[iy] = dy[iy] + da*dx[ix]; |
| ix = ix + incx; |
| iy = iy + incy; |
| |
| } |
| return; |
| } |
| |
| /* code for both increments equal to 1 */ |
| |
| |
| #ifdef ROLL |
| |
| for (i = 0;i < n; i++) { |
| dy[i] = dy[i] + da*dx[i]; |
| } |
| |
| |
| #endif |
| |
| #ifdef UNROLL |
| |
| m = n % 4; |
| if ( m != 0) { |
| for (i = 0; i < m; i++) |
| dy[i] = dy[i] + da*dx[i]; |
| |
| if (n < 4) return; |
| } |
| for (i = m; i < n; i = i + 4) { |
| dy[i] = dy[i] + da*dx[i]; |
| dy[i+1] = dy[i+1] + da*dx[i+1]; |
| dy[i+2] = dy[i+2] + da*dx[i+2]; |
| dy[i+3] = dy[i+3] + da*dx[i+3]; |
| |
| } |
| |
| #endif |
| return; |
| } |
| |
| /*----------------------*/ |
| |
| REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy) |
| /* |
| forms the dot product of two vectors. |
| jack dongarra, linpack, 3/11/78. |
| */ |
| |
| { |
| REAL dtemp; |
| int i,ix,iy,m,mp1; |
| |
| mp1 = 0; |
| m = 0; |
| |
| dtemp = ZERO; |
| |
| if(n <= 0) return(ZERO); |
| |
| if(incx != 1 || incy != 1) { |
| |
| /* code for unequal increments or equal increments |
| not equal to 1 */ |
| |
| ix = 0; |
| iy = 0; |
| if (incx < 0) ix = (-n+1)*incx; |
| if (incy < 0) iy = (-n+1)*incy; |
| for (i = 0;i < n; i++) { |
| dtemp = dtemp + dx[ix]*dy[iy]; |
| ix = ix + incx; |
| iy = iy + incy; |
| |
| } |
| return(dtemp); |
| } |
| |
| /* code for both increments equal to 1 */ |
| |
| |
| #ifdef ROLL |
| |
| for (i=0;i < n; i++) |
| dtemp = dtemp + dx[i]*dy[i]; |
| |
| return(dtemp); |
| |
| #endif |
| |
| #ifdef UNROLL |
| |
| |
| m = n % 5; |
| if (m != 0) { |
| for (i = 0; i < m; i++) |
| dtemp = dtemp + dx[i]*dy[i]; |
| if (n < 5) return(dtemp); |
| } |
| for (i = m; i < n; i = i + 5) { |
| dtemp = dtemp + dx[i]*dy[i] + |
| dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] + |
| dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4]; |
| } |
| return(dtemp); |
| |
| #endif |
| |
| } |
| |
| /*----------------------*/ |
| void dscal(int n, REAL da, REAL dx[], int incx) |
| |
| /* scales a vector by a constant. |
| jack dongarra, linpack, 3/11/78. |
| */ |
| |
| { |
| int i,m,mp1,nincx; |
| |
| mp1 = 0; |
| m = 0; |
| |
| if(n <= 0)return; |
| if(incx != 1) { |
| |
| /* code for increment not equal to 1 */ |
| |
| nincx = n*incx; |
| for (i = 0; i < nincx; i = i + incx) |
| dx[i] = da*dx[i]; |
| |
| return; |
| } |
| |
| /* code for increment equal to 1 */ |
| |
| |
| #ifdef ROLL |
| |
| for (i = 0; i < n; i++) |
| dx[i] = da*dx[i]; |
| |
| |
| #endif |
| |
| #ifdef UNROLL |
| |
| |
| m = n % 5; |
| if (m != 0) { |
| for (i = 0; i < m; i++) |
| dx[i] = da*dx[i]; |
| if (n < 5) return; |
| } |
| for (i = m; i < n; i = i + 5){ |
| dx[i] = da*dx[i]; |
| dx[i+1] = da*dx[i+1]; |
| dx[i+2] = da*dx[i+2]; |
| dx[i+3] = da*dx[i+3]; |
| dx[i+4] = da*dx[i+4]; |
| } |
| |
| #endif |
| |
| } |
| |
| /*----------------------*/ |
| int idamax(int n, REAL dx[], int incx) |
| |
| /* |
| finds the index of element having max. absolute value. |
| jack dongarra, linpack, 3/11/78. |
| */ |
| |
| |
| { |
| REAL dmax; |
| int i, ix, itemp; |
| |
| if( n < 1 ) return(-1); |
| if(n ==1 ) return(0); |
| if(incx != 1) { |
| |
| /* code for increment not equal to 1 */ |
| |
| ix = 1; |
| dmax = fabs((double)dx[0]); |
| ix = ix + incx; |
| for (i = 1; i < n; i++) { |
| if(fabs((double)dx[ix]) > dmax) { |
| itemp = i; |
| dmax = fabs((double)dx[ix]); |
| } |
| ix = ix + incx; |
| } |
| } |
| else { |
| |
| /* code for increment equal to 1 */ |
| |
| itemp = 0; |
| dmax = fabs((double)dx[0]); |
| for (i = 1; i < n; i++) { |
| if(fabs((double)dx[i]) > dmax) { |
| itemp = i; |
| dmax = fabs((double)dx[i]); |
| } |
| } |
| } |
| return (itemp); |
| } |
| |
| /*----------------------*/ |
| REAL epslon (REAL x) |
| |
| /* |
| estimate unit roundoff in quantities of size x. |
| */ |
| |
| { |
| REAL a,b,c,eps; |
| /* |
| this program should function properly on all systems |
| satisfying the following two assumptions, |
| 1. the base used in representing dfloating point |
| numbers is not a power of three. |
| 2. the quantity a in statement 10 is represented to |
| the accuracy used in dfloating point variables |
| that are stored in memory. |
| the statement number 10 and the go to 10 are intended to |
| force optimizing compilers to generate code satisfying |
| assumption 2. |
| under these assumptions, it should be true that, |
| a is not exactly equal to four-thirds, |
| b has a zero for its last bit or digit, |
| c is not exactly equal to one, |
| eps measures the separation of 1.0 from |
| the next larger dfloating point number. |
| the developers of eispack would appreciate being informed |
| about any systems where these assumptions do not hold. |
| |
| ***************************************************************** |
| this routine is one of the auxiliary routines used by eispack iii |
| to avoid machine dependencies. |
| ***************************************************************** |
| |
| this version dated 4/6/83. |
| */ |
| |
| a = 4.0e0/3.0e0; |
| eps = ZERO; |
| while (eps == ZERO) { |
| b = a - ONE; |
| c = b + b + b; |
| eps = fabs((double)(c-ONE)); |
| } |
| return(eps*fabs((double)x)); |
| } |
| |
| /*----------------------*/ |
| void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) |
| |
| |
| /* We would like to declare m[][ldm], but c does not allow it. In this |
| function, references to m[i][j] are written m[ldm*i+j]. */ |
| |
| /* |
| purpose: |
| multiply matrix m times vector x and add the result to vector y. |
| |
| parameters: |
| |
| n1 integer, number of elements in vector y, and number of rows in |
| matrix m |
| |
| y double [n1], vector of length n1 to which is added |
| the product m*x |
| |
| n2 integer, number of elements in vector x, and number of columns |
| in matrix m |
| |
| ldm integer, leading dimension of array m |
| |
| x double [n2], vector of length n2 |
| |
| m double [ldm][n2], matrix of n1 rows and n2 columns |
| |
| ---------------------------------------------------------------------- |
| */ |
| { |
| int j,i,jmin; |
| /* cleanup odd vector */ |
| |
| j = n2 % 2; |
| if (j >= 1) { |
| j = j - 1; |
| for (i = 0; i < n1; i++) |
| y[i] = (y[i]) + x[j]*m[ldm*j+i]; |
| } |
| |
| /* cleanup odd group of two vectors */ |
| |
| j = n2 % 4; |
| if (j >= 2) { |
| j = j - 1; |
| for (i = 0; i < n1; i++) |
| y[i] = ( (y[i]) |
| + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
| } |
| |
| /* cleanup odd group of four vectors */ |
| |
| j = n2 % 8; |
| if (j >= 4) { |
| j = j - 1; |
| for (i = 0; i < n1; i++) |
| y[i] = ((( (y[i]) |
| + x[j-3]*m[ldm*(j-3)+i]) |
| + x[j-2]*m[ldm*(j-2)+i]) |
| + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
| } |
| |
| /* cleanup odd group of eight vectors */ |
| |
| j = n2 % 16; |
| if (j >= 8) { |
| j = j - 1; |
| for (i = 0; i < n1; i++) |
| y[i] = ((((((( (y[i]) |
| + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i]) |
| + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i]) |
| + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i]) |
| + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i]; |
| } |
| |
| /* main loop - groups of sixteen vectors */ |
| |
| jmin = (n2%16)+16; |
| for (j = jmin-1; j < n2; j = j + 16) { |
| for (i = 0; i < n1; i++) |
| y[i] = ((((((((((((((( (y[i]) |
| + x[j-15]*m[ldm*(j-15)+i]) |
| + x[j-14]*m[ldm*(j-14)+i]) |
| + x[j-13]*m[ldm*(j-13)+i]) |
| + x[j-12]*m[ldm*(j-12)+i]) |
| + x[j-11]*m[ldm*(j-11)+i]) |
| + x[j-10]*m[ldm*(j-10)+i]) |
| + x[j- 9]*m[ldm*(j- 9)+i]) |
| + x[j- 8]*m[ldm*(j- 8)+i]) |
| + x[j- 7]*m[ldm*(j- 7)+i]) |
| + x[j- 6]*m[ldm*(j- 6)+i]) |
| + x[j- 5]*m[ldm*(j- 5)+i]) |
| + x[j- 4]*m[ldm*(j- 4)+i]) |
| + x[j- 3]*m[ldm*(j- 3)+i]) |
| + x[j- 2]*m[ldm*(j- 2)+i]) |
| + x[j- 1]*m[ldm*(j- 1)+i]) |
| + x[j] *m[ldm*j+i]; |
| } |
| return; |
| } |
| |
| |
| |