| /* Create tuned thresholds for various algorithms. |
| |
| Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation, Inc. |
| |
| This file is part of the GNU MP Library. |
| |
| The GNU MP Library is free software; you can redistribute it and/or modify |
| it under the terms of the GNU Lesser General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or (at your |
| option) any later version. |
| |
| The GNU MP Library is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public License |
| along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ |
| |
| |
| /* Usage: tuneup [-t] [-t] [-p precision] |
| |
| -t turns on some diagnostic traces, a second -t turns on more traces. |
| |
| Notes: |
| |
| The code here isn't a vision of loveliness, mainly because it's subject |
| to ongoing changes according to new things wanting to be tuned, and |
| practical requirements of systems tested. |
| |
| Sometimes running the program twice produces slightly different results. |
| This is probably because there's so little separating algorithms near |
| their crossover, and on that basis it should make little or no difference |
| to the final speed of the relevant routines, but nothing has been done to |
| check that carefully. |
| |
| Algorithm: |
| |
| The thresholds are determined as follows. A crossover may not be a |
| single size but rather a range where it oscillates between method A or |
| method B faster. If the threshold is set making B used where A is faster |
| (or vice versa) that's bad. Badness is the percentage time lost and |
| total badness is the sum of this over all sizes measured. The threshold |
| is set to minimize total badness. |
| |
| Suppose, as sizes increase, method B becomes faster than method A. The |
| effect of the rule is that, as you look at increasing sizes, isolated |
| points where B is faster are ignored, but when it's consistently faster, |
| or faster on balance, then the threshold is set there. The same result |
| is obtained thinking in the other direction of A becoming faster at |
| smaller sizes. |
| |
| In practice the thresholds tend to be chosen to bring on the next |
| algorithm fairly quickly. |
| |
| This rule is attractive because it's got a basis in reason and is fairly |
| easy to implement, but no work has been done to actually compare it in |
| absolute terms to other possibilities. |
| |
| Implementation: |
| |
| In a normal library build the thresholds are constants. To tune them |
| selected objects are recompiled with the thresholds as global variables |
| instead. #define TUNE_PROGRAM_BUILD does this, with help from code at |
| the end of gmp-impl.h, and rules in tune/Makefile.am. |
| |
| MUL_KARATSUBA_THRESHOLD for example uses a recompiled mpn_mul_n. The |
| threshold is set to "size+1" to avoid karatsuba, or to "size" to use one |
| level, but recurse into the basecase. |
| |
| MUL_TOOM3_THRESHOLD makes use of the tuned MUL_KARATSUBA_THRESHOLD value. |
| Other routines in turn will make use of both of those. Naturally the |
| dependants must be tuned first. |
| |
| In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling, |
| just a threshold based on comparing two routines (mpn_divrem_1 and |
| mpn_divexact_1), and no further use of the value determined. |
| |
| Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being |
| just comparisons between certain routines on representative data. |
| |
| Shortcuts are applied when native (assembler) versions of routines exist. |
| For instance a native mpn_sqr_basecase is assumed to be always faster |
| than mpn_mul_basecase, with no measuring. |
| |
| No attempt is made to tune within assembler routines, for instance |
| DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be |
| written and tuned all by hand. Assembler routines that might have hard |
| limits are recompiled though, to make them accept a bigger range of sizes |
| than normal, eg. mpn_sqr_basecase to compare against mpn_kara_sqr_n. |
| |
| Limitations: |
| |
| The FFTs aren't subject to the same badness rule as the other thresholds, |
| so each k is probably being brought on a touch early. This isn't likely |
| to make a difference, and the simpler probing means fewer tests. |
| |
| */ |
| |
| #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */ |
| |
| #include "config.h" |
| |
| #include <math.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <time.h> |
| #if HAVE_UNISTD_H |
| #include <unistd.h> |
| #endif |
| |
| #include "gmp.h" |
| #include "gmp-impl.h" |
| #include "longlong.h" |
| |
| #include "tests.h" |
| #include "speed.h" |
| |
| #if !HAVE_DECL_OPTARG |
| extern char *optarg; |
| extern int optind, opterr; |
| #endif |
| |
| |
| #define DEFAULT_MAX_SIZE 1000 /* limbs */ |
| |
| #if WANT_FFT |
| mp_size_t option_fft_max_size = 50000; /* limbs */ |
| #else |
| mp_size_t option_fft_max_size = 0; |
| #endif |
| int option_trace = 0; |
| int option_fft_trace = 0; |
| struct speed_params s; |
| |
| struct dat_t { |
| mp_size_t size; |
| double d; |
| } *dat = NULL; |
| int ndat = 0; |
| int allocdat = 0; |
| |
| /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that |
| case use zero here, which for params.max_size means no limit. */ |
| #ifndef TUNE_SQR_KARATSUBA_MAX |
| #define TUNE_SQR_KARATSUBA_MAX 0 |
| #endif |
| |
| mp_size_t mul_karatsuba_threshold = MP_SIZE_T_MAX; |
| mp_size_t mul_toom3_threshold = MUL_TOOM3_THRESHOLD_LIMIT; |
| mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT; |
| mp_size_t mul_fft_threshold = MP_SIZE_T_MAX; |
| mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX; |
| mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX; |
| mp_size_t sqr_karatsuba_threshold |
| = (TUNE_SQR_KARATSUBA_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_KARATSUBA_MAX); |
| mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT; |
| mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT; |
| mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX; |
| mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX; |
| mp_size_t mullow_basecase_threshold = MP_SIZE_T_MAX; |
| mp_size_t mullow_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t mullow_mul_n_threshold = MP_SIZE_T_MAX; |
| mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX; |
| mp_size_t div_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t powm_threshold = MP_SIZE_T_MAX; |
| mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX; |
| mp_size_t hgcd_threshold = MP_SIZE_T_MAX; |
| mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX; |
| mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX; |
| mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_1_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_2_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_3_threshold = MP_SIZE_T_MAX; |
| mp_size_t mod_1_4_threshold = MP_SIZE_T_MAX; |
| mp_size_t divrem_2_threshold = MP_SIZE_T_MAX; |
| mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX; |
| mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX; |
| mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX; |
| |
| mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX; |
| mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX; |
| |
| struct param_t { |
| const char *name; |
| speed_function_t function; |
| speed_function_t function2; |
| double step_factor; /* how much to step sizes (rounded down) */ |
| double function_fudge; /* multiplier for "function" speeds */ |
| int stop_since_change; |
| double stop_factor; |
| mp_size_t min_size; |
| int min_is_always; |
| mp_size_t max_size; |
| mp_size_t check_size; |
| mp_size_t size_extra; |
| |
| #define DATA_HIGH_LT_R 1 |
| #define DATA_HIGH_GE_R 2 |
| int data_high; |
| |
| int noprint; |
| }; |
| |
| |
| /* These are normally undefined when false, which suits "#if" fine. |
| But give them zero values so they can be used in plain C "if"s. */ |
| #ifndef UDIV_PREINV_ALWAYS |
| #define UDIV_PREINV_ALWAYS 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_divexact_1 |
| #define HAVE_NATIVE_mpn_divexact_1 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_divrem_1 |
| #define HAVE_NATIVE_mpn_divrem_1 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_divrem_2 |
| #define HAVE_NATIVE_mpn_divrem_2 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_mod_1 |
| #define HAVE_NATIVE_mpn_mod_1 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_modexact_1_odd |
| #define HAVE_NATIVE_mpn_modexact_1_odd 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_preinv_divrem_1 |
| #define HAVE_NATIVE_mpn_preinv_divrem_1 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_preinv_mod_1 |
| #define HAVE_NATIVE_mpn_preinv_mod_1 0 |
| #endif |
| #ifndef HAVE_NATIVE_mpn_sqr_basecase |
| #define HAVE_NATIVE_mpn_sqr_basecase 0 |
| #endif |
| |
| |
| #define MAX3(a,b,c) MAX (MAX (a, b), c) |
| |
| mp_limb_t |
| randlimb_norm (void) |
| { |
| mp_limb_t n; |
| mpn_random (&n, 1); |
| n |= GMP_NUMB_HIGHBIT; |
| return n; |
| } |
| |
| #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1) |
| |
| mp_limb_t |
| randlimb_half (void) |
| { |
| mp_limb_t n; |
| mpn_random (&n, 1); |
| n &= GMP_NUMB_HALFMASK; |
| n += (n==0); |
| return n; |
| } |
| |
| |
| /* Add an entry to the end of the dat[] array, reallocing to make it bigger |
| if necessary. */ |
| void |
| add_dat (mp_size_t size, double d) |
| { |
| #define ALLOCDAT_STEP 500 |
| |
| ASSERT_ALWAYS (ndat <= allocdat); |
| |
| if (ndat == allocdat) |
| { |
| dat = (struct dat_t *) __gmp_allocate_or_reallocate |
| (dat, allocdat * sizeof(dat[0]), |
| (allocdat+ALLOCDAT_STEP) * sizeof(dat[0])); |
| allocdat += ALLOCDAT_STEP; |
| } |
| |
| dat[ndat].size = size; |
| dat[ndat].d = d; |
| ndat++; |
| } |
| |
| |
| /* Return the threshold size based on the data accumulated. */ |
| mp_size_t |
| analyze_dat (int final) |
| { |
| double x, min_x; |
| int j, min_j; |
| |
| /* If the threshold is set at dat[0].size, any positive values are bad. */ |
| x = 0.0; |
| for (j = 0; j < ndat; j++) |
| if (dat[j].d > 0.0) |
| x += dat[j].d; |
| |
| if (option_trace >= 2 && final) |
| { |
| printf ("\n"); |
| printf ("x is the sum of the badness from setting thresh at given size\n"); |
| printf (" (minimum x is sought)\n"); |
| printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x); |
| } |
| |
| min_x = x; |
| min_j = 0; |
| |
| |
| /* When stepping to the next dat[j].size, positive values are no longer |
| bad (so subtracted), negative values become bad (so add the absolute |
| value, meaning subtract). */ |
| for (j = 0; j < ndat; x -= dat[j].d, j++) |
| { |
| if (option_trace >= 2 && final) |
| printf ("size=%ld x=%.4f\n", (long) dat[j].size, x); |
| |
| if (x < min_x) |
| { |
| min_x = x; |
| min_j = j; |
| } |
| } |
| |
| return min_j; |
| } |
| |
| |
| /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */ |
| |
| mp_limb_t mpn_divrem_1_tune |
| __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t)); |
| mp_limb_t mpn_mod_1_tune |
| __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t)); |
| |
| double |
| speed_mpn_mod_1_tune (struct speed_params *s) |
| { |
| SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune); |
| } |
| double |
| speed_mpn_divrem_1_tune (struct speed_params *s) |
| { |
| SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune); |
| } |
| |
| |
| double |
| tuneup_measure (speed_function_t fun, |
| const struct param_t *param, |
| struct speed_params *s) |
| { |
| static struct param_t dummy; |
| double t; |
| TMP_DECL; |
| |
| if (! param) |
| param = &dummy; |
| |
| s->size += param->size_extra; |
| |
| TMP_MARK; |
| SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0); |
| SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0); |
| |
| mpn_random (s->xp, s->size); |
| mpn_random (s->yp, s->size); |
| |
| switch (param->data_high) { |
| case DATA_HIGH_LT_R: |
| s->xp[s->size-1] %= s->r; |
| s->yp[s->size-1] %= s->r; |
| break; |
| case DATA_HIGH_GE_R: |
| s->xp[s->size-1] |= s->r; |
| s->yp[s->size-1] |= s->r; |
| break; |
| } |
| |
| t = speed_measure (fun, s); |
| |
| s->size -= param->size_extra; |
| |
| TMP_FREE; |
| return t; |
| } |
| |
| |
| #define PRINT_WIDTH 28 |
| |
| void |
| print_define_start (const char *name) |
| { |
| printf ("#define %-*s ", PRINT_WIDTH, name); |
| if (option_trace) |
| printf ("...\n"); |
| } |
| |
| void |
| print_define_end_remark (const char *name, mp_size_t value, const char *remark) |
| { |
| if (option_trace) |
| printf ("#define %-*s ", PRINT_WIDTH, name); |
| |
| if (value == MP_SIZE_T_MAX) |
| printf ("MP_SIZE_T_MAX"); |
| else |
| printf ("%5ld", (long) value); |
| |
| if (remark != NULL) |
| printf (" /* %s */", remark); |
| printf ("\n"); |
| } |
| |
| void |
| print_define_end (const char *name, mp_size_t value) |
| { |
| const char *remark; |
| if (value == MP_SIZE_T_MAX) |
| remark = "never"; |
| else if (value == 0) |
| remark = "always"; |
| else |
| remark = NULL; |
| print_define_end_remark (name, value, remark); |
| } |
| |
| void |
| print_define (const char *name, mp_size_t value) |
| { |
| print_define_start (name); |
| print_define_end (name, value); |
| } |
| |
| void |
| print_define_remark (const char *name, mp_size_t value, const char *remark) |
| { |
| print_define_start (name); |
| print_define_end_remark (name, value, remark); |
| } |
| |
| |
| void |
| one (mp_size_t *threshold, struct param_t *param) |
| { |
| int since_positive, since_thresh_change; |
| int thresh_idx, new_thresh_idx; |
| |
| #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0) |
| |
| DEFAULT (param->function_fudge, 1.0); |
| DEFAULT (param->function2, param->function); |
| DEFAULT (param->step_factor, 0.01); /* small steps by default */ |
| DEFAULT (param->stop_since_change, 80); |
| DEFAULT (param->stop_factor, 1.2); |
| DEFAULT (param->min_size, 10); |
| DEFAULT (param->max_size, DEFAULT_MAX_SIZE); |
| |
| if (param->check_size != 0) |
| { |
| double t1, t2; |
| s.size = param->check_size; |
| |
| *threshold = s.size+1; |
| t1 = tuneup_measure (param->function, param, &s); |
| |
| *threshold = s.size; |
| t2 = tuneup_measure (param->function2, param, &s); |
| if (t1 == -1.0 || t2 == -1.0) |
| { |
| printf ("Oops, can't run both functions at size %ld\n", |
| (long) s.size); |
| abort (); |
| } |
| t1 *= param->function_fudge; |
| |
| /* ask that t2 is at least 4% below t1 */ |
| if (t1 < t2*1.04) |
| { |
| if (option_trace) |
| printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2); |
| *threshold = MP_SIZE_T_MAX; |
| if (! param->noprint) |
| print_define (param->name, *threshold); |
| return; |
| } |
| |
| if (option_trace >= 2) |
| printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n", |
| (long) s.size, t1, t2); |
| } |
| |
| if (! param->noprint || option_trace) |
| print_define_start (param->name); |
| |
| ndat = 0; |
| since_positive = 0; |
| since_thresh_change = 0; |
| thresh_idx = 0; |
| |
| if (option_trace >= 2) |
| { |
| printf (" algorithm-A algorithm-B ratio possible\n"); |
| printf (" (seconds) (seconds) diff thresh\n"); |
| } |
| |
| for (s.size = param->min_size; |
| s.size < param->max_size; |
| s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), 1)) |
| { |
| double ti, tiplus1, d; |
| |
| /* If there's a size limit and it's reached then it should still |
| be sensible to analyze the data since we want the threshold put |
| either at or near the limit. */ |
| if (s.size >= param->max_size) |
| { |
| if (option_trace) |
| printf ("Reached maximum size (%ld) without otherwise stopping\n", |
| (long) param->max_size); |
| break; |
| } |
| |
| /* |
| FIXME: check minimum size requirements are met, possibly by just |
| checking for the -1 returns from the speed functions. |
| */ |
| |
| /* using method A at this size */ |
| *threshold = s.size+1; |
| ti = tuneup_measure (param->function, param, &s); |
| if (ti == -1.0) |
| abort (); |
| ti *= param->function_fudge; |
| |
| /* using method B at this size */ |
| *threshold = s.size; |
| tiplus1 = tuneup_measure (param->function2, param, &s); |
| if (tiplus1 == -1.0) |
| abort (); |
| |
| /* Calculate the fraction by which the one or the other routine is |
| slower. */ |
| if (tiplus1 >= ti) |
| d = (tiplus1 - ti) / tiplus1; /* negative */ |
| else |
| d = (tiplus1 - ti) / ti; /* positive */ |
| |
| add_dat (s.size, d); |
| |
| new_thresh_idx = analyze_dat (0); |
| |
| if (option_trace >= 2) |
| printf ("size=%ld %.9f %.9f % .4f %c %ld\n", |
| (long) s.size, ti, tiplus1, d, |
| ti > tiplus1 ? '#' : ' ', |
| (long) dat[new_thresh_idx].size); |
| |
| /* Stop if the last time method i was faster was more than a |
| certain number of measurements ago. */ |
| #define STOP_SINCE_POSITIVE 200 |
| if (d >= 0) |
| since_positive = 0; |
| else |
| if (++since_positive > STOP_SINCE_POSITIVE) |
| { |
| if (option_trace >= 1) |
| printf ("stopped due to since_positive (%d)\n", |
| STOP_SINCE_POSITIVE); |
| break; |
| } |
| |
| /* Stop if method A has become slower by a certain factor. */ |
| if (ti >= tiplus1 * param->stop_factor) |
| { |
| if (option_trace >= 1) |
| printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n", |
| param->stop_factor); |
| break; |
| } |
| |
| /* Stop if the threshold implied hasn't changed in a certain |
| number of measurements. (It's this condition that ususally |
| stops the loop.) */ |
| if (thresh_idx != new_thresh_idx) |
| since_thresh_change = 0, thresh_idx = new_thresh_idx; |
| else |
| if (++since_thresh_change > param->stop_since_change) |
| { |
| if (option_trace >= 1) |
| printf ("stopped due to since_thresh_change (%d)\n", |
| param->stop_since_change); |
| break; |
| } |
| |
| /* Stop if the threshold implied is more than a certain number of |
| measurements ago. */ |
| #define STOP_SINCE_AFTER 500 |
| if (ndat - thresh_idx > STOP_SINCE_AFTER) |
| { |
| if (option_trace >= 1) |
| printf ("stopped due to ndat - thresh_idx > amount (%d)\n", |
| STOP_SINCE_AFTER); |
| break; |
| } |
| |
| /* Stop when the size limit is reached before the end of the |
| crossover, but only show this as an error for >= the default max |
| size. FIXME: Maybe should make it a param choice whether this is |
| an error. */ |
| if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE) |
| { |
| fprintf (stderr, "%s\n", param->name); |
| fprintf (stderr, "sizes %ld to %ld total %d measurements\n", |
| (long) dat[0].size, (long) dat[ndat-1].size, ndat); |
| fprintf (stderr, " max size reached before end of crossover\n"); |
| break; |
| } |
| } |
| |
| if (option_trace >= 1) |
| printf ("sizes %ld to %ld total %d measurements\n", |
| (long) dat[0].size, (long) dat[ndat-1].size, ndat); |
| |
| *threshold = dat[analyze_dat (1)].size; |
| |
| if (param->min_is_always) |
| { |
| if (*threshold == param->min_size) |
| *threshold = 0; |
| } |
| |
| if (! param->noprint || option_trace) |
| print_define_end (param->name, *threshold); |
| } |
| |
| |
| /* Special probing for the fft thresholds. The size restrictions on the |
| FFTs mean the graph of time vs size has a step effect. See this for |
| example using |
| |
| ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9 |
| gnuplot foo.gnuplot |
| |
| The current approach is to compare routines at the midpoint of relevant |
| steps. Arguably a more sophisticated system of threshold data is wanted |
| if this step effect remains. */ |
| |
| struct fft_param_t { |
| const char *table_name; |
| const char *threshold_name; |
| const char *modf_threshold_name; |
| mp_size_t *p_threshold; |
| mp_size_t *p_modf_threshold; |
| mp_size_t first_size; |
| mp_size_t max_size; |
| speed_function_t function; |
| speed_function_t mul_function; |
| mp_size_t sqr; |
| }; |
| |
| |
| /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with |
| N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple |
| of 2^(k-1) bits. */ |
| |
| mp_size_t |
| fft_step_size (int k) |
| { |
| mp_size_t step; |
| |
| step = MAX ((mp_size_t) 1 << (k-1), BITS_PER_MP_LIMB) / BITS_PER_MP_LIMB; |
| step *= (mp_size_t) 1 << k; |
| |
| if (step <= 0) |
| { |
| printf ("Can't handle k=%d\n", k); |
| abort (); |
| } |
| |
| return step; |
| } |
| |
| mp_size_t |
| fft_next_size (mp_size_t pl, int k) |
| { |
| mp_size_t m = fft_step_size (k); |
| |
| /* printf ("[k=%d %ld] %ld ->", k, m, pl); */ |
| |
| if (pl == 0 || (pl & (m-1)) != 0) |
| pl = (pl | (m-1)) + 1; |
| |
| /* printf (" %ld\n", pl); */ |
| return pl; |
| } |
| |
| void |
| fft (struct fft_param_t *p) |
| { |
| mp_size_t size; |
| int i, k; |
| |
| for (i = 0; i < numberof (mpn_fft_table[p->sqr]); i++) |
| mpn_fft_table[p->sqr][i] = MP_SIZE_T_MAX; |
| |
| *p->p_threshold = MP_SIZE_T_MAX; |
| *p->p_modf_threshold = MP_SIZE_T_MAX; |
| |
| option_trace = MAX (option_trace, option_fft_trace); |
| |
| printf ("#define %s {", p->table_name); |
| if (option_trace >= 2) |
| printf ("\n"); |
| |
| k = FFT_FIRST_K; |
| size = p->first_size; |
| for (;;) |
| { |
| double tk, tk1; |
| |
| size = fft_next_size (size+1, k+1); |
| |
| if (size >= p->max_size) |
| break; |
| if (k >= FFT_FIRST_K + numberof (mpn_fft_table[p->sqr])) |
| break; |
| |
| /* compare k to k+1 in the middle of the current k+1 step */ |
| s.size = size + fft_step_size (k+1) / 2; |
| s.r = k; |
| tk = tuneup_measure (p->function, NULL, &s); |
| if (tk == -1.0) |
| abort (); |
| |
| s.r = k+1; |
| tk1 = tuneup_measure (p->function, NULL, &s); |
| if (tk1 == -1.0) |
| abort (); |
| |
| if (option_trace >= 2) |
| printf ("at %ld size=%ld k=%d %.9f k=%d %.9f\n", |
| (long) size, (long) s.size, k, tk, k+1, tk1); |
| |
| /* declare the k+1 threshold as soon as it's faster at its midpoint */ |
| if (tk1 < tk) |
| { |
| mpn_fft_table[p->sqr][k-FFT_FIRST_K] = s.size; |
| printf (" %ld,", (long) s.size); |
| if (option_trace >= 2) printf ("\n"); |
| k++; |
| } |
| } |
| |
| mpn_fft_table[p->sqr][k-FFT_FIRST_K] = 0; |
| printf (" 0 }\n"); |
| |
| |
| size = p->first_size; |
| |
| /* Declare an FFT faster than a plain toom4 etc multiplication found as |
| soon as one faster measurement obtained. A multiplication in the |
| middle of the FFT step is tested. */ |
| for (;;) |
| { |
| int modf = (*p->p_modf_threshold == MP_SIZE_T_MAX); |
| double tk, tm; |
| |
| /* k=7 should be the first FFT which can beat toom4 on a full |
| multiply, so jump to that threshold and save some probing after the |
| modf threshold is found. */ |
| if (!modf && size < mpn_fft_table[p->sqr][2]) |
| { |
| size = mpn_fft_table[p->sqr][2]; |
| if (option_trace >= 2) |
| printf ("jump to size=%ld\n", (long) size); |
| } |
| |
| size = fft_next_size (size+1, mpn_fft_best_k (size, p->sqr)); |
| k = mpn_fft_best_k (size, p->sqr); |
| |
| if (size >= p->max_size) |
| break; |
| |
| s.size = size + fft_step_size (k) / 2; |
| s.r = k; |
| tk = tuneup_measure (p->function, NULL, &s); |
| if (tk == -1.0) |
| abort (); |
| |
| if (!modf) s.size /= 2; |
| tm = tuneup_measure (p->mul_function, NULL, &s); |
| if (tm == -1.0) |
| abort (); |
| |
| if (option_trace >= 2) |
| printf ("at %ld size=%ld k=%d %.9f size=%ld %s mul %.9f\n", |
| (long) size, |
| (long) size + fft_step_size (k) / 2, k, tk, |
| (long) s.size, modf ? "modf" : "full", tm); |
| |
| if (tk < tm) |
| { |
| if (modf) |
| { |
| *p->p_modf_threshold = s.size; |
| print_define (p->modf_threshold_name, *p->p_modf_threshold); |
| } |
| else |
| { |
| *p->p_threshold = s.size; |
| print_define (p->threshold_name, *p->p_threshold); |
| break; |
| } |
| } |
| } |
| |
| } |
| |
| |
| |
| /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2, |
| giving wrong results. */ |
| void |
| tune_mul (void) |
| { |
| static struct param_t param; |
| |
| param.function = speed_mpn_mul_n; |
| |
| param.name = "MUL_KARATSUBA_THRESHOLD"; |
| param.min_size = MAX (4, MPN_KARA_MUL_N_MINSIZE); |
| param.max_size = MUL_KARATSUBA_THRESHOLD_LIMIT-1; |
| one (&mul_karatsuba_threshold, ¶m); |
| |
| param.name = "MUL_TOOM3_THRESHOLD"; |
| param.min_size = MAX (mul_karatsuba_threshold, MPN_TOOM3_MUL_N_MINSIZE); |
| param.max_size = MUL_TOOM3_THRESHOLD_LIMIT-1; |
| one (&mul_toom3_threshold, ¶m); |
| |
| param.name = "MUL_TOOM44_THRESHOLD"; |
| param.min_size = MAX (mul_toom3_threshold, MPN_TOOM44_MUL_N_MINSIZE); |
| param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1; |
| one (&mul_toom44_threshold, ¶m); |
| |
| /* disabled until tuned */ |
| MUL_FFT_THRESHOLD = MP_SIZE_T_MAX; |
| } |
| |
| |
| /* This was written by the tuneup challenged tege. Kevin, please delete |
| this comment when you've reviewed/rewritten this. :-) */ |
| void |
| tune_mullow (void) |
| { |
| static struct param_t param; |
| |
| param.function = speed_mpn_mullow_n; |
| |
| param.name = "MULLOW_BASECASE_THRESHOLD"; |
| param.min_size = 3; |
| param.min_is_always = 1; |
| param.max_size = MULLOW_BASECASE_THRESHOLD_LIMIT-1; |
| one (&mullow_basecase_threshold, ¶m); |
| |
| param.min_is_always = 0; /* ??? */ |
| |
| param.name = "MULLOW_DC_THRESHOLD"; |
| param.min_size = mul_karatsuba_threshold; |
| param.max_size = 1000; |
| one (&mullow_dc_threshold, ¶m); |
| |
| param.name = "MULLOW_MUL_N_THRESHOLD"; |
| param.min_size = mullow_dc_threshold; |
| param.max_size = 2000; |
| one (&mullow_mul_n_threshold, ¶m); |
| } |
| |
| |
| /* Start the basecase from 3, since 1 is a special case, and if mul_basecase |
| is faster only at size==2 then we don't want to bother with extra code |
| just for that. Start karatsuba from 4 same as MUL above. */ |
| |
| void |
| tune_sqr (void) |
| { |
| /* disabled until tuned */ |
| SQR_FFT_THRESHOLD = MP_SIZE_T_MAX; |
| |
| if (HAVE_NATIVE_mpn_sqr_basecase) |
| { |
| print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)"); |
| sqr_basecase_threshold = 0; |
| } |
| else |
| { |
| static struct param_t param; |
| param.name = "SQR_BASECASE_THRESHOLD"; |
| param.function = speed_mpn_sqr_n; |
| param.min_size = 3; |
| param.min_is_always = 1; |
| param.max_size = TUNE_SQR_KARATSUBA_MAX; |
| param.noprint = 1; |
| one (&sqr_basecase_threshold, ¶m); |
| } |
| |
| { |
| static struct param_t param; |
| param.name = "SQR_KARATSUBA_THRESHOLD"; |
| param.function = speed_mpn_sqr_n; |
| param.min_size = MAX (4, MPN_KARA_SQR_N_MINSIZE); |
| param.max_size = TUNE_SQR_KARATSUBA_MAX; |
| param.noprint = 1; |
| one (&sqr_karatsuba_threshold, ¶m); |
| |
| if (! HAVE_NATIVE_mpn_sqr_basecase |
| && sqr_karatsuba_threshold < sqr_basecase_threshold) |
| { |
| /* Karatsuba becomes faster than mul_basecase before |
| sqr_basecase does. Arrange for the expression |
| "BELOW_THRESHOLD (un, SQR_KARATSUBA_THRESHOLD))" which |
| selects mpn_sqr_basecase in mpn_sqr_n to be false, by setting |
| SQR_KARATSUBA_THRESHOLD to zero, making |
| SQR_BASECASE_THRESHOLD the karatsuba threshold. */ |
| |
| sqr_basecase_threshold = SQR_KARATSUBA_THRESHOLD; |
| SQR_KARATSUBA_THRESHOLD = 0; |
| |
| print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold, |
| "karatsuba"); |
| print_define_remark ("SQR_KARATSUBA_THRESHOLD",SQR_KARATSUBA_THRESHOLD, |
| "never sqr_basecase"); |
| } |
| else |
| { |
| if (! HAVE_NATIVE_mpn_sqr_basecase) |
| print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold); |
| print_define ("SQR_KARATSUBA_THRESHOLD", SQR_KARATSUBA_THRESHOLD); |
| } |
| } |
| |
| { |
| static struct param_t param; |
| mp_size_t toom3_start = MAX (sqr_karatsuba_threshold, sqr_basecase_threshold); |
| |
| param.function = speed_mpn_sqr_n; |
| |
| param.name = "SQR_TOOM3_THRESHOLD"; |
| param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_N_MINSIZE); |
| param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1; |
| one (&sqr_toom3_threshold, ¶m); |
| |
| param.name = "SQR_TOOM4_THRESHOLD"; |
| param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_N_MINSIZE); |
| param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1; |
| one (&sqr_toom4_threshold, ¶m); |
| } |
| } |
| |
| |
| void |
| tune_sb_preinv (void) |
| { |
| static struct param_t param; |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| DIV_SB_PREINV_THRESHOLD = MP_SIZE_T_MAX; |
| print_define_remark ("DIV_SB_PREINV_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| return; |
| } |
| |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define_remark ("DIV_SB_PREINV_THRESHOLD", 0L, "preinv always"); |
| return; |
| } |
| |
| param.check_size = 256; |
| param.min_size = 3; |
| param.min_is_always = 1; |
| param.size_extra = 3; |
| param.stop_factor = 2.0; |
| param.name = "DIV_SB_PREINV_THRESHOLD"; |
| param.function = speed_mpn_sb_divrem_m3; |
| one (&div_sb_preinv_threshold, ¶m); |
| } |
| |
| |
| void |
| tune_dc (void) |
| { |
| static struct param_t param; |
| param.name = "DIV_DC_THRESHOLD"; |
| param.function = speed_mpn_dc_tdiv_qr; |
| param.step_factor = 0.02; |
| one (&div_dc_threshold, ¶m); |
| } |
| |
| |
| /* This is an indirect determination, based on a comparison between redc and |
| mpz_mod. A fudge factor of 1.04 is applied to redc, to represent |
| additional overheads it gets in mpz_powm. |
| |
| stop_factor is 1.1 to hopefully help cray vector systems, where otherwise |
| currently it hits the 1000 limb limit with only a factor of about 1.18 |
| (threshold should be around 650). */ |
| |
| void |
| tune_powm (void) |
| { |
| static struct param_t param; |
| param.name = "POWM_THRESHOLD"; |
| param.function = speed_mpn_redc_1; |
| param.function2 = speed_mpz_mod; |
| param.step_factor = 0.03; |
| param.stop_factor = 1.1; |
| param.function_fudge = 1.04; |
| one (&powm_threshold, ¶m); |
| } |
| |
| |
| void |
| tune_matrix22_mul (void) |
| { |
| static struct param_t param; |
| param.name = "MATRIX22_STRASSEN_THRESHOLD"; |
| param.function = speed_mpn_matrix22_mul; |
| param.min_size = 2; |
| one (&matrix22_strassen_threshold, ¶m); |
| } |
| |
| void |
| tune_hgcd (void) |
| { |
| static struct param_t param; |
| param.name = "HGCD_THRESHOLD"; |
| param.function = speed_mpn_hgcd; |
| /* We seem to get strange results for small sizes */ |
| param.min_size = 30; |
| one (&hgcd_threshold, ¶m); |
| } |
| |
| #if 0 |
| void |
| tune_gcd_accel (void) |
| { |
| static struct param_t param; |
| param.name = "GCD_ACCEL_THRESHOLD"; |
| param.function = speed_mpn_gcd; |
| param.min_size = 1; |
| one (&gcd_accel_threshold, ¶m); |
| } |
| #endif |
| void |
| tune_gcd_dc (void) |
| { |
| static struct param_t param; |
| param.name = "GCD_DC_THRESHOLD"; |
| param.function = speed_mpn_gcd; |
| param.min_size = hgcd_threshold; |
| param.max_size = 3000; |
| param.step_factor = 0.02; |
| one (&gcd_dc_threshold, ¶m); |
| } |
| |
| void |
| tune_gcdext_dc (void) |
| { |
| static struct param_t param; |
| param.name = "GCDEXT_DC_THRESHOLD"; |
| param.function = speed_mpn_gcdext; |
| param.min_size = hgcd_threshold; |
| param.max_size = 3000; |
| param.step_factor = 0.02; |
| one (&gcdext_dc_threshold, ¶m); |
| } |
| |
| |
| /* size_extra==1 reflects the fact that with high<divisor one division is |
| always skipped. Forcing high<divisor while testing ensures consistency |
| while stepping through sizes, ie. that size-1 divides will be done each |
| time. |
| |
| min_size==2 and min_is_always are used so that if plain division is only |
| better at size==1 then don't bother including that code just for that |
| case, instead go with preinv always and get a size saving. */ |
| |
| #define DIV_1_PARAMS \ |
| param.check_size = 256; \ |
| param.min_size = 2; \ |
| param.min_is_always = 1; \ |
| param.data_high = DATA_HIGH_LT_R; \ |
| param.size_extra = 1; \ |
| param.stop_factor = 2.0; |
| |
| |
| double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *)); |
| |
| void |
| tune_divrem_1 (void) |
| { |
| /* plain version by default */ |
| tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1; |
| |
| /* No support for tuning native assembler code, do that by hand and put |
| the results in the .asm file, there's no need for such thresholds to |
| appear in gmp-mparam.h. */ |
| if (HAVE_NATIVE_mpn_divrem_1) |
| return; |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| return; |
| } |
| |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always"); |
| print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L); |
| return; |
| } |
| |
| tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune; |
| |
| /* Tune for the integer part of mpn_divrem_1. This will very possibly be |
| a bit out for the fractional part, but that's too bad, the integer part |
| is more important. */ |
| { |
| static struct param_t param; |
| param.name = "DIVREM_1_NORM_THRESHOLD"; |
| DIV_1_PARAMS; |
| s.r = randlimb_norm (); |
| param.function = speed_mpn_divrem_1_tune; |
| one (&divrem_1_norm_threshold, ¶m); |
| } |
| { |
| static struct param_t param; |
| param.name = "DIVREM_1_UNNORM_THRESHOLD"; |
| DIV_1_PARAMS; |
| s.r = randlimb_half (); |
| param.function = speed_mpn_divrem_1_tune; |
| one (&divrem_1_unnorm_threshold, ¶m); |
| } |
| } |
| |
| |
| double (*tuned_speed_mpn_mod_1) __GMP_PROTO ((struct speed_params *)); |
| |
| void |
| tune_mod_1 (void) |
| { |
| /* plain version by default */ |
| tuned_speed_mpn_mod_1 = speed_mpn_mod_1; |
| |
| /* No support for tuning native assembler code, do that by hand and put |
| the results in the .asm file, there's no need for such thresholds to |
| appear in gmp-mparam.h. */ |
| if (HAVE_NATIVE_mpn_mod_1) |
| return; |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| return; |
| } |
| |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define ("MOD_1_NORM_THRESHOLD", 0L); |
| print_define ("MOD_1_UNNORM_THRESHOLD", 0L); |
| } |
| else |
| { |
| tuned_speed_mpn_mod_1 = speed_mpn_mod_1_tune; |
| |
| { |
| static struct param_t param; |
| param.name = "MOD_1_NORM_THRESHOLD"; |
| DIV_1_PARAMS; |
| s.r = randlimb_norm (); |
| param.function = speed_mpn_mod_1_tune; |
| one (&mod_1_norm_threshold, ¶m); |
| } |
| { |
| static struct param_t param; |
| param.name = "MOD_1_UNNORM_THRESHOLD"; |
| DIV_1_PARAMS; |
| s.r = randlimb_half (); |
| param.function = speed_mpn_mod_1_tune; |
| one (&mod_1_unnorm_threshold, ¶m); |
| } |
| } |
| { |
| static struct param_t param; |
| |
| s.r = GMP_NUMB_MASK / 5; |
| param.function = speed_mpn_mod_1_tune; |
| param.min_size = 1; |
| |
| param.name = "MOD_1_1_THRESHOLD"; |
| one (&mod_1_1_threshold, ¶m); |
| |
| param.name = "MOD_1_2_THRESHOLD"; |
| param.min_size = mod_1_1_threshold + 1; |
| one (&mod_1_2_threshold, ¶m); |
| |
| #if 0 |
| param.name = "MOD_1_3_THRESHOLD"; |
| param.min_size = mod_1_2_threshold + 1; |
| one (&mod_1_3_threshold, ¶m); |
| #endif |
| |
| param.name = "MOD_1_4_THRESHOLD"; |
| param.min_size = mod_1_2_threshold + 1; |
| one (&mod_1_4_threshold, ¶m); |
| } |
| } |
| |
| |
| /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would |
| imply that udiv_qrnnd_preinv is worth using, but it seems most |
| straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div |
| directly. */ |
| |
| void |
| tune_preinv_divrem_1 (void) |
| { |
| static struct param_t param; |
| speed_function_t divrem_1; |
| const char *divrem_1_name; |
| double t1, t2; |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails"); |
| return; |
| } |
| |
| /* Any native version of mpn_preinv_divrem_1 is assumed to exist because |
| it's faster than mpn_divrem_1. */ |
| if (HAVE_NATIVE_mpn_preinv_divrem_1) |
| { |
| print_define_remark ("USE_PREINV_DIVREM_1", 1, "native"); |
| return; |
| } |
| |
| /* If udiv_qrnnd_preinv is the only division method then of course |
| mpn_preinv_divrem_1 should be used. */ |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always"); |
| return; |
| } |
| |
| /* If we've got an assembler version of mpn_divrem_1, then compare against |
| that, not the mpn_divrem_1_div generic C. */ |
| if (HAVE_NATIVE_mpn_divrem_1) |
| { |
| divrem_1 = speed_mpn_divrem_1; |
| divrem_1_name = "mpn_divrem_1"; |
| } |
| else |
| { |
| divrem_1 = speed_mpn_divrem_1_div; |
| divrem_1_name = "mpn_divrem_1_div"; |
| } |
| |
| param.data_high = DATA_HIGH_LT_R; /* allow skip one division */ |
| s.size = 200; /* generous but not too big */ |
| /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case, |
| since in general that's probably most common, though in fact for a |
| 64-bit limb mp_bases[10].big_base is normalized. */ |
| s.r = urandom() & (GMP_NUMB_MASK >> 4); |
| if (s.r == 0) s.r = 123; |
| |
| t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s); |
| t2 = tuneup_measure (divrem_1, ¶m, &s); |
| if (t1 == -1.0 || t2 == -1.0) |
| { |
| printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n", |
| divrem_1_name, (long) s.size); |
| abort (); |
| } |
| if (option_trace >= 1) |
| printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n", |
| (long) s.size, t1, divrem_1_name, t2); |
| |
| print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL); |
| } |
| |
| |
| /* A non-zero MOD_1_UNNORM_THRESHOLD (or MOD_1_NORM_THRESHOLD) would imply |
| that udiv_qrnnd_preinv is worth using, but it seems most straightforward |
| to compare mpn_preinv_mod_1 and mpn_mod_1_div directly. */ |
| |
| void |
| tune_preinv_mod_1 (void) |
| { |
| static struct param_t param; |
| speed_function_t mod_1; |
| const char *mod_1_name; |
| double t1, t2; |
| |
| /* Any native version of mpn_preinv_mod_1 is assumed to exist because it's |
| faster than mpn_mod_1. */ |
| if (HAVE_NATIVE_mpn_preinv_mod_1) |
| { |
| print_define_remark ("USE_PREINV_MOD_1", 1, "native"); |
| return; |
| } |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| print_define_remark ("USE_PREINV_MOD_1", 0, "no preinv with nails"); |
| return; |
| } |
| |
| /* If udiv_qrnnd_preinv is the only division method then of course |
| mpn_preinv_mod_1 should be used. */ |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define_remark ("USE_PREINV_MOD_1", 1, "preinv always"); |
| return; |
| } |
| |
| /* If we've got an assembler version of mpn_mod_1, then compare against |
| that, not the mpn_mod_1_div generic C. */ |
| if (HAVE_NATIVE_mpn_mod_1) |
| { |
| mod_1 = speed_mpn_mod_1; |
| mod_1_name = "mpn_mod_1"; |
| } |
| else |
| { |
| mod_1 = speed_mpn_mod_1_div; |
| mod_1_name = "mpn_mod_1_div"; |
| } |
| |
| param.data_high = DATA_HIGH_LT_R; /* let mpn_mod_1 skip one division */ |
| s.size = 200; /* generous but not too big */ |
| s.r = randlimb_norm(); /* divisor */ |
| |
| t1 = tuneup_measure (speed_mpn_preinv_mod_1, ¶m, &s); |
| t2 = tuneup_measure (mod_1, ¶m, &s); |
| if (t1 == -1.0 || t2 == -1.0) |
| { |
| printf ("Oops, can't measure mpn_preinv_mod_1 and %s at %ld\n", |
| mod_1_name, (long) s.size); |
| abort (); |
| } |
| if (option_trace >= 1) |
| printf ("size=%ld, mpn_preinv_mod_1 %.9f, %s %.9f\n", |
| (long) s.size, t1, mod_1_name, t2); |
| |
| print_define_remark ("USE_PREINV_MOD_1", (mp_size_t) (t1 < t2), NULL); |
| } |
| |
| |
| void |
| tune_divrem_2 (void) |
| { |
| static struct param_t param; |
| |
| /* No support for tuning native assembler code, do that by hand and put |
| the results in the .asm file, and there's no need for such thresholds |
| to appear in gmp-mparam.h. */ |
| if (HAVE_NATIVE_mpn_divrem_2) |
| return; |
| |
| if (GMP_NAIL_BITS != 0) |
| { |
| print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX, |
| "no preinv with nails"); |
| return; |
| } |
| |
| if (UDIV_PREINV_ALWAYS) |
| { |
| print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always"); |
| return; |
| } |
| |
| /* Tune for the integer part of mpn_divrem_2. This will very possibly be |
| a bit out for the fractional part, but that's too bad, the integer part |
| is more important. |
| |
| min_size must be >=2 since nsize>=2 is required, but is set to 4 to save |
| code space if plain division is better only at size==2 or size==3. */ |
| param.name = "DIVREM_2_THRESHOLD"; |
| param.check_size = 256; |
| param.min_size = 4; |
| param.min_is_always = 1; |
| param.size_extra = 2; /* does qsize==nsize-2 divisions */ |
| param.stop_factor = 2.0; |
| |
| s.r = randlimb_norm (); |
| param.function = speed_mpn_divrem_2; |
| one (&divrem_2_threshold, ¶m); |
| } |
| |
| |
| /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so |
| tune for that. Its speed can differ on odd or even divisor, so take an |
| average threshold for the two. |
| |
| mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1 |
| might not vary that way, but don't test this since high<divisor isn't |
| expected to occur often with small divisors. */ |
| |
| void |
| tune_divexact_1 (void) |
| { |
| static struct param_t param; |
| mp_size_t thresh[2], average; |
| int low, i; |
| |
| /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a |
| full mpn_divrem_1. */ |
| if (HAVE_NATIVE_mpn_divexact_1) |
| { |
| print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)"); |
| return; |
| } |
| |
| ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL); |
| |
| param.name = "DIVEXACT_1_THRESHOLD"; |
| param.data_high = DATA_HIGH_GE_R; |
| param.check_size = 256; |
| param.min_size = 2; |
| param.stop_factor = 1.5; |
| param.function = tuned_speed_mpn_divrem_1; |
| param.function2 = speed_mpn_divexact_1; |
| param.noprint = 1; |
| |
| print_define_start (param.name); |
| |
| for (low = 0; low <= 1; low++) |
| { |
| s.r = randlimb_half(); |
| if (low == 0) |
| s.r |= 1; |
| else |
| s.r &= ~CNST_LIMB(7); |
| |
| one (&thresh[low], ¶m); |
| if (option_trace) |
| printf ("low=%d thresh %ld\n", low, (long) thresh[low]); |
| |
| if (thresh[low] == MP_SIZE_T_MAX) |
| { |
| average = MP_SIZE_T_MAX; |
| goto divexact_1_done; |
| } |
| } |
| |
| if (option_trace) |
| { |
| printf ("average of:"); |
| for (i = 0; i < numberof(thresh); i++) |
| printf (" %ld", (long) thresh[i]); |
| printf ("\n"); |
| } |
| |
| average = 0; |
| for (i = 0; i < numberof(thresh); i++) |
| average += thresh[i]; |
| average /= numberof(thresh); |
| |
| /* If divexact turns out to be better as early as 3 limbs, then use it |
| always, so as to reduce code size and conditional jumps. */ |
| if (average <= 3) |
| average = 0; |
| |
| divexact_1_done: |
| print_define_end (param.name, average); |
| } |
| |
| |
| /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the |
| same as mpn_mod_1, but this might not be true of an assembler |
| implementation. The threshold used is an average based on data where a |
| divide can be skipped and where it can't. |
| |
| If modexact turns out to be better as early as 3 limbs, then use it |
| always, so as to reduce code size and conditional jumps. */ |
| |
| void |
| tune_modexact_1_odd (void) |
| { |
| static struct param_t param; |
| mp_size_t thresh_lt, thresh_ge, average; |
| |
| /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed |
| of a full mpn_mod_1. */ |
| if (HAVE_NATIVE_mpn_modexact_1_odd) |
| { |
| print_define_remark ("MODEXACT_1_ODD_THRESHOLD", 0, "always (native)"); |
| return; |
| } |
| |
| ASSERT_ALWAYS (tuned_speed_mpn_mod_1 != NULL); |
| |
| param.name = "MODEXACT_1_ODD_THRESHOLD"; |
| param.check_size = 256; |
| param.min_size = 2; |
| param.stop_factor = 1.5; |
| param.function = tuned_speed_mpn_mod_1; |
| param.function2 = speed_mpn_modexact_1c_odd; |
| param.noprint = 1; |
| s.r = randlimb_half () | 1; |
| |
| print_define_start (param.name); |
| |
| param.data_high = DATA_HIGH_LT_R; |
| one (&thresh_lt, ¶m); |
| if (option_trace) |
| printf ("lt thresh %ld\n", (long) thresh_lt); |
| |
| average = thresh_lt; |
| if (thresh_lt != MP_SIZE_T_MAX) |
| { |
| param.data_high = DATA_HIGH_GE_R; |
| one (&thresh_ge, ¶m); |
| if (option_trace) |
| printf ("ge thresh %ld\n", (long) thresh_ge); |
| |
| if (thresh_ge != MP_SIZE_T_MAX) |
| { |
| average = (thresh_ge + thresh_lt) / 2; |
| if (thresh_ge <= 3) |
| average = 0; |
| } |
| } |
| |
| print_define_end (param.name, average); |
| } |
| |
| |
| void |
| tune_jacobi_base (void) |
| { |
| static struct param_t param; |
| double t1, t2, t3; |
| int method; |
| |
| s.size = BITS_PER_MP_LIMB * 3 / 4; |
| |
| t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s); |
| if (option_trace >= 1) |
| printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1); |
| |
| t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s); |
| if (option_trace >= 1) |
| printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2); |
| |
| t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s); |
| if (option_trace >= 1) |
| printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3); |
| |
| if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0) |
| { |
| printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n", |
| (long) s.size); |
| abort (); |
| } |
| |
| if (t1 < t2 && t1 < t3) |
| method = 1; |
| else if (t2 < t3) |
| method = 2; |
| else |
| method = 3; |
| |
| print_define ("JACOBI_BASE_METHOD", method); |
| } |
| |
| |
| void |
| tune_get_str (void) |
| { |
| /* Tune for decimal, it being most common. Some rough testing suggests |
| other bases are different, but not by very much. */ |
| s.r = 10; |
| { |
| static struct param_t param; |
| GET_STR_PRECOMPUTE_THRESHOLD = 0; |
| param.name = "GET_STR_DC_THRESHOLD"; |
| param.function = speed_mpn_get_str; |
| param.min_size = 4; |
| param.max_size = GET_STR_THRESHOLD_LIMIT; |
| one (&get_str_dc_threshold, ¶m); |
| } |
| { |
| static struct param_t param; |
| param.name = "GET_STR_PRECOMPUTE_THRESHOLD"; |
| param.function = speed_mpn_get_str; |
| param.min_size = GET_STR_DC_THRESHOLD; |
| param.max_size = GET_STR_THRESHOLD_LIMIT; |
| one (&get_str_precompute_threshold, ¶m); |
| } |
| } |
| |
| |
| double |
| speed_mpn_pre_set_str (struct speed_params *s) |
| { |
| unsigned char *str; |
| mp_ptr wp; |
| mp_size_t wn; |
| unsigned i; |
| int base; |
| double t; |
| mp_ptr powtab_mem, tp; |
| powers_t powtab[GMP_LIMB_BITS]; |
| mp_size_t un; |
| int chars_per_limb; |
| TMP_DECL; |
| |
| SPEED_RESTRICT_COND (s->size >= 1); |
| |
| base = s->r == 0 ? 10 : s->r; |
| SPEED_RESTRICT_COND (base >= 2 && base <= 256); |
| |
| TMP_MARK; |
| |
| str = TMP_ALLOC (s->size); |
| for (i = 0; i < s->size; i++) |
| str[i] = s->xp[i] % base; |
| |
| wn = ((mp_size_t) (s->size / __mp_bases[base].chars_per_bit_exactly)) |
| / BITS_PER_MP_LIMB + 2; |
| SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); |
| |
| /* use this during development to check wn is big enough */ |
| /* |
| ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn); |
| */ |
| |
| speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB); |
| speed_operand_dst (s, wp, wn); |
| speed_cache_fill (s); |
| |
| chars_per_limb = __mp_bases[base].chars_per_limb; |
| un = s->size / chars_per_limb + 1; |
| powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un)); |
| mpn_set_str_compute_powtab (powtab, powtab_mem, un, base); |
| tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); |
| |
| speed_starttime (); |
| i = s->reps; |
| do |
| { |
| mpn_pre_set_str (wp, str, s->size, powtab, tp); |
| } |
| while (--i != 0); |
| t = speed_endtime (); |
| |
| TMP_FREE; |
| return t; |
| } |
| |
| void |
| tune_set_str (void) |
| { |
| static struct param_t param; |
| |
| s.r = 10; /* decimal */ |
| { |
| static struct param_t param; |
| SET_STR_PRECOMPUTE_THRESHOLD = 0; |
| param.step_factor = 0.01; |
| param.name = "SET_STR_DC_THRESHOLD"; |
| param.function = speed_mpn_pre_set_str; |
| param.min_size = 100; |
| param.max_size = 50000; |
| one (&set_str_dc_threshold, ¶m); |
| } |
| { |
| static struct param_t param; |
| param.step_factor = 0.02; |
| param.name = "SET_STR_PRECOMPUTE_THRESHOLD"; |
| param.function = speed_mpn_set_str; |
| param.min_size = SET_STR_DC_THRESHOLD; |
| param.max_size = 100000; |
| one (&set_str_precompute_threshold, ¶m); |
| } |
| } |
| |
| |
| void |
| tune_fft_mul (void) |
| { |
| static struct fft_param_t param; |
| |
| if (option_fft_max_size == 0) |
| return; |
| |
| param.table_name = "MUL_FFT_TABLE"; |
| param.threshold_name = "MUL_FFT_THRESHOLD"; |
| param.p_threshold = &mul_fft_threshold; |
| param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD"; |
| param.p_modf_threshold = &mul_fft_modf_threshold; |
| param.first_size = MUL_TOOM3_THRESHOLD / 2; |
| param.max_size = option_fft_max_size; |
| param.function = speed_mpn_mul_fft; |
| param.mul_function = speed_mpn_mul_n; |
| param.sqr = 0; |
| fft (¶m); |
| } |
| |
| |
| void |
| tune_fft_sqr (void) |
| { |
| static struct fft_param_t param; |
| |
| if (option_fft_max_size == 0) |
| return; |
| |
| param.table_name = "SQR_FFT_TABLE"; |
| param.threshold_name = "SQR_FFT_THRESHOLD"; |
| param.p_threshold = &sqr_fft_threshold; |
| param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD"; |
| param.p_modf_threshold = &sqr_fft_modf_threshold; |
| param.first_size = SQR_TOOM3_THRESHOLD / 2; |
| param.max_size = option_fft_max_size; |
| param.function = speed_mpn_mul_fft_sqr; |
| param.mul_function = speed_mpn_sqr_n; |
| param.sqr = 0; |
| fft (¶m); |
| } |
| |
| |
| void |
| all (void) |
| { |
| time_t start_time, end_time; |
| TMP_DECL; |
| |
| TMP_MARK; |
| SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0); |
| SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0); |
| |
| mpn_random (s.xp_block, SPEED_BLOCK_SIZE); |
| mpn_random (s.yp_block, SPEED_BLOCK_SIZE); |
| |
| fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST); |
| |
| speed_time_init (); |
| fprintf (stderr, "Using: %s\n", speed_time_string); |
| |
| fprintf (stderr, "speed_precision %d", speed_precision); |
| if (speed_unittime == 1.0) |
| fprintf (stderr, ", speed_unittime 1 cycle"); |
| else |
| fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime); |
| if (speed_cycletime == 1.0 || speed_cycletime == 0.0) |
| fprintf (stderr, ", CPU freq unknown\n"); |
| else |
| fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime); |
| |
| fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n", |
| DEFAULT_MAX_SIZE, (long) option_fft_max_size); |
| fprintf (stderr, "\n"); |
| |
| time (&start_time); |
| { |
| struct tm *tp; |
| tp = localtime (&start_time); |
| printf ("/* Generated by tuneup.c, %d-%02d-%02d, ", |
| tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday); |
| |
| #ifdef __GNUC__ |
| /* gcc sub-minor version doesn't seem to come through as a define */ |
| printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__); |
| #define PRINTED_COMPILER |
| #endif |
| #if defined (__SUNPRO_C) |
| printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100); |
| #define PRINTED_COMPILER |
| #endif |
| #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION) |
| /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */ |
| printf ("MIPSpro C %d.%d.%d */\n", |
| _COMPILER_VERSION / 100, |
| _COMPILER_VERSION / 10 % 10, |
| _COMPILER_VERSION % 10); |
| #define PRINTED_COMPILER |
| #endif |
| #if defined (__DECC) && defined (__DECC_VER) |
| printf ("DEC C %d */\n", __DECC_VER); |
| #define PRINTED_COMPILER |
| #endif |
| #if ! defined (PRINTED_COMPILER) |
| printf ("system compiler */\n"); |
| #endif |
| } |
| printf ("\n"); |
| |
| tune_mul (); |
| printf("\n"); |
| |
| tune_sqr (); |
| printf("\n"); |
| |
| tune_mullow (); |
| printf("\n"); |
| |
| tune_sb_preinv (); |
| tune_dc (); |
| tune_powm (); |
| printf("\n"); |
| |
| tune_matrix22_mul (); |
| tune_hgcd (); |
| tune_gcd_dc (); |
| tune_gcdext_dc (); |
| #if 0 |
| tune_gcd_accel (); |
| #endif |
| tune_jacobi_base (); |
| printf("\n"); |
| |
| tune_divrem_1 (); |
| tune_mod_1 (); |
| tune_preinv_divrem_1 (); |
| tune_preinv_mod_1 (); |
| tune_divrem_2 (); |
| tune_divexact_1 (); |
| tune_modexact_1_odd (); |
| printf("\n"); |
| |
| tune_get_str (); |
| tune_set_str (); |
| printf("\n"); |
| |
| tune_fft_mul (); |
| printf("\n"); |
| |
| tune_fft_sqr (); |
| printf ("\n"); |
| |
| time (&end_time); |
| printf ("/* Tuneup completed successfully, took %ld seconds */\n", |
| end_time - start_time); |
| |
| TMP_FREE; |
| } |
| |
| |
| int |
| main (int argc, char *argv[]) |
| { |
| int opt; |
| |
| /* Unbuffered so if output is redirected to a file it isn't lost if the |
| program is killed part way through. */ |
| setbuf (stdout, NULL); |
| setbuf (stderr, NULL); |
| |
| while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF) |
| { |
| switch (opt) { |
| case 'f': |
| if (optarg[0] == 't') |
| option_fft_trace = 2; |
| else |
| option_fft_max_size = atol (optarg); |
| break; |
| case 'o': |
| speed_option_set (optarg); |
| break; |
| case 'p': |
| speed_precision = atoi (optarg); |
| break; |
| case 't': |
| option_trace++; |
| break; |
| case '?': |
| exit(1); |
| } |
| } |
| |
| all (); |
| exit (0); |
| } |