| /* hgcd.c. |
| |
| THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
| SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
| GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
| |
| Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc. |
| |
| This file is part of the GNU MP Library. |
| |
| The GNU MP Library is free software; you can redistribute it and/or modify |
| it under the terms of the GNU Lesser General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or (at your |
| option) any later version. |
| |
| The GNU MP Library is distributed in the hope that it will be useful, but |
| WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
| or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public |
| License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public License |
| along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ |
| |
| #include "gmp.h" |
| #include "gmp-impl.h" |
| #include "longlong.h" |
| |
| /* For input of size n, matrix elements are of size at most ceil(n/2) |
| - 1, but we need two limbs extra. */ |
| void |
| mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) |
| { |
| mp_size_t s = (n+1)/2 + 1; |
| M->alloc = s; |
| M->n = 1; |
| MPN_ZERO (p, 4 * s); |
| M->p[0][0] = p; |
| M->p[0][1] = p + s; |
| M->p[1][0] = p + 2 * s; |
| M->p[1][1] = p + 3 * s; |
| |
| M->p[0][0][0] = M->p[1][1][0] = 1; |
| } |
| |
| /* Updated column COL, adding in column (1-COL). */ |
| static void |
| hgcd_matrix_update_1 (struct hgcd_matrix *M, unsigned col) |
| { |
| mp_limb_t c0, c1; |
| ASSERT (col < 2); |
| |
| c0 = mpn_add_n (M->p[0][col], M->p[0][0], M->p[0][1], M->n); |
| c1 = mpn_add_n (M->p[1][col], M->p[1][0], M->p[1][1], M->n); |
| |
| M->p[0][col][M->n] = c0; |
| M->p[1][col][M->n] = c1; |
| |
| M->n += (c0 | c1) != 0; |
| ASSERT (M->n < M->alloc); |
| } |
| |
| /* Updated column COL, adding in column Q * (1-COL). Temporary |
| * storage: qn + n <= M->alloc, where n is the size of the largest |
| * element in column 1 - COL. */ |
| static void |
| hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn, |
| unsigned col, mp_ptr tp) |
| { |
| ASSERT (col < 2); |
| |
| if (qn == 1) |
| { |
| mp_limb_t q = qp[0]; |
| mp_limb_t c0, c1; |
| |
| c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); |
| c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); |
| |
| M->p[0][col][M->n] = c0; |
| M->p[1][col][M->n] = c1; |
| |
| M->n += (c0 | c1) != 0; |
| } |
| else |
| { |
| unsigned row; |
| |
| /* Carries for the unlikely case that we get both high words |
| from the multiplication and carries from the addition. */ |
| mp_limb_t c[2]; |
| mp_size_t n; |
| |
| /* The matrix will not necessarily grow in size by qn, so we |
| need normalization in order not to overflow M. */ |
| |
| for (n = M->n; n + qn > M->n; n--) |
| { |
| ASSERT (n > 0); |
| if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) |
| break; |
| } |
| |
| ASSERT (qn + n <= M->alloc); |
| |
| for (row = 0; row < 2; row++) |
| { |
| if (qn <= n) |
| mpn_mul (tp, M->p[row][1-col], n, qp, qn); |
| else |
| mpn_mul (tp, qp, qn, M->p[row][1-col], n); |
| |
| ASSERT (n + qn >= M->n); |
| c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n); |
| } |
| if (c[0] | c[1]) |
| { |
| M->n = n + qn + 1; |
| M->p[0][col][n-1] = c[0]; |
| M->p[1][col][n-1] = c[1]; |
| } |
| else |
| { |
| n += qn; |
| n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; |
| if (n > M->n) |
| M->n = n; |
| } |
| } |
| |
| ASSERT (M->n < M->alloc); |
| } |
| |
| /* Multiply M by M1 from the right. Since the M1 elements fit in |
| GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs |
| temporary space M->n */ |
| static void |
| hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1, |
| mp_ptr tp) |
| { |
| mp_size_t n0, n1; |
| |
| /* Could avoid copy by some swapping of pointers. */ |
| MPN_COPY (tp, M->p[0][0], M->n); |
| n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n); |
| MPN_COPY (tp, M->p[1][0], M->n); |
| n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n); |
| |
| /* Depends on zero initialization */ |
| M->n = MAX(n0, n1); |
| ASSERT (M->n < M->alloc); |
| } |
| |
| /* Perform a few steps, using some of mpn_hgcd2, subtraction and |
| division. Reduces the size by almost one limb or more, but never |
| below the given size s. Return new size for a and b, or 0 if no |
| more steps are possible. |
| |
| If hgcd2 succeds, needs temporary space for hgcd_matrix_mul_1, M->n |
| limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 |
| fails, needs space for the quotient, qn <= n - s + 1 limbs, for and |
| hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= |
| resulting size of $. |
| |
| If N is the input size to the calling hgcd, then s = floor(N/2) + |
| 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1 |
| < N, so N is sufficient. |
| */ |
| |
| static mp_size_t |
| hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, |
| struct hgcd_matrix *M, mp_ptr tp) |
| { |
| struct hgcd_matrix1 M1; |
| mp_limb_t mask; |
| mp_limb_t ah, al, bh, bl; |
| mp_size_t an, bn, qn; |
| int col; |
| |
| ASSERT (n > s); |
| |
| mask = ap[n-1] | bp[n-1]; |
| ASSERT (mask > 0); |
| |
| if (n == s + 1) |
| { |
| if (mask < 4) |
| goto subtract; |
| |
| ah = ap[n-1]; al = ap[n-2]; |
| bh = bp[n-1]; bl = bp[n-2]; |
| } |
| else if (mask & GMP_NUMB_HIGHBIT) |
| { |
| ah = ap[n-1]; al = ap[n-2]; |
| bh = bp[n-1]; bl = bp[n-2]; |
| } |
| else |
| { |
| int shift; |
| |
| count_leading_zeros (shift, mask); |
| ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); |
| al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); |
| bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); |
| bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); |
| } |
| |
| /* Try an mpn_hgcd2 step */ |
| if (mpn_hgcd2 (ah, al, bh, bl, &M1)) |
| { |
| /* Multiply M <- M * M1 */ |
| hgcd_matrix_mul_1 (M, &M1, tp); |
| |
| /* Can't swap inputs, so we need to copy. */ |
| MPN_COPY (tp, ap, n); |
| /* Multiply M1^{-1} (a;b) */ |
| return mpn_hgcd_mul_matrix1_inverse_vector (&M1, ap, tp, bp, n); |
| } |
| |
| subtract: |
| /* There are two ways in which mpn_hgcd2 can fail. Either one of ah and |
| bh was too small, or ah, bh were (almost) equal. Perform one |
| subtraction step (for possible cancellation of high limbs), |
| followed by one division. */ |
| |
| /* Since we must ensure that #(a-b) > s, we handle cancellation of |
| high limbs explicitly up front. (FIXME: Or is it better to just |
| subtract, normalize, and use an addition to undo if it turns out |
| the the difference is too small?) */ |
| for (an = n; an > s; an--) |
| if (ap[an-1] != bp[an-1]) |
| break; |
| |
| if (an == s) |
| return 0; |
| |
| /* Maintain a > b. When needed, swap a and b, and let col keep track |
| of how to update M. */ |
| if (ap[an-1] > bp[an-1]) |
| { |
| /* a is largest. In the subtraction step, we need to update |
| column 1 of M */ |
| col = 1; |
| } |
| else |
| { |
| MP_PTR_SWAP (ap, bp); |
| col = 0; |
| } |
| |
| bn = n; |
| MPN_NORMALIZE (bp, bn); |
| if (bn <= s) |
| return 0; |
| |
| /* We have #a, #b > s. When is it possible that #(a-b) < s? For |
| cancellation to happen, the numbers must be of the form |
| |
| a = x + 1, 0, ..., 0, al |
| b = x , GMP_NUMB_MAX, ..., GMP_NUMB_MAX, bl |
| |
| where al, bl denotes the least significant k limbs. If al < bl, |
| then #(a-b) < k, and if also high(al) != 0, high(bl) != GMP_NUMB_MAX, |
| then #(a-b) = k. If al >= bl, then #(a-b) = k + 1. */ |
| |
| if (ap[an-1] == bp[an-1] + 1) |
| { |
| mp_size_t k; |
| int c; |
| for (k = an-1; k > s; k--) |
| if (ap[k-1] != 0 || bp[k-1] != GMP_NUMB_MAX) |
| break; |
| |
| MPN_CMP (c, ap, bp, k); |
| if (c < 0) |
| { |
| mp_limb_t cy; |
| |
| /* The limbs from k and up are cancelled. */ |
| if (k == s) |
| return 0; |
| cy = mpn_sub_n (ap, ap, bp, k); |
| ASSERT (cy == 1); |
| an = k; |
| } |
| else |
| { |
| ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, k)); |
| ap[k] = 1; |
| an = k + 1; |
| } |
| } |
| else |
| ASSERT_NOCARRY (mpn_sub_n (ap, ap, bp, an)); |
| |
| ASSERT (an > s); |
| ASSERT (ap[an-1] > 0); |
| ASSERT (bn > s); |
| ASSERT (bp[bn-1] > 0); |
| |
| hgcd_matrix_update_1 (M, col); |
| |
| if (an < bn) |
| { |
| MPN_PTR_SWAP (ap, an, bp, bn); |
| col ^= 1; |
| } |
| else if (an == bn) |
| { |
| int c; |
| MPN_CMP (c, ap, bp, an); |
| if (c < 0) |
| { |
| MP_PTR_SWAP (ap, bp); |
| col ^= 1; |
| } |
| } |
| |
| /* Divide a / b. */ |
| qn = an + 1 - bn; |
| |
| /* FIXME: We could use an approximate division, that may return a |
| too small quotient, and only guarantee that the size of r is |
| almost the size of b. FIXME: Let ap and remainder overlap. */ |
| mpn_tdiv_qr (tp, ap, 0, ap, an, bp, bn); |
| qn -= (tp[qn -1] == 0); |
| |
| /* Normalize remainder */ |
| an = bn; |
| for ( ; an > s; an--) |
| if (ap[an-1] > 0) |
| break; |
| |
| if (an <= s) |
| { |
| /* Quotient is too large */ |
| mp_limb_t cy; |
| |
| cy = mpn_add (ap, bp, bn, ap, an); |
| |
| if (cy > 0) |
| { |
| ASSERT (bn < n); |
| ap[bn] = cy; |
| bp[bn] = 0; |
| bn++; |
| } |
| |
| MPN_DECR_U (tp, qn, 1); |
| qn -= (tp[qn-1] == 0); |
| } |
| |
| if (qn > 0) |
| hgcd_matrix_update_q (M, tp, qn, col, tp + qn); |
| |
| return bn; |
| } |
| |
| /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M |
| with elements of size at most (n+1)/2 - 1. Returns new size of a, |
| b, or zero if no reduction is possible. */ |
| mp_size_t |
| mpn_hgcd_lehmer (mp_ptr ap, mp_ptr bp, mp_size_t n, |
| struct hgcd_matrix *M, mp_ptr tp) |
| { |
| mp_size_t s = n/2 + 1; |
| mp_size_t nn; |
| |
| ASSERT (n > s); |
| ASSERT (ap[n-1] > 0 || bp[n-1] > 0); |
| |
| nn = hgcd_step (n, ap, bp, s, M, tp); |
| if (!nn) |
| return 0; |
| |
| for (;;) |
| { |
| n = nn; |
| ASSERT (n > s); |
| nn = hgcd_step (n, ap, bp, s, M, tp); |
| if (!nn ) |
| return n; |
| } |
| } |
| |
| /* Multiply M by M1 from the right. Needs 4*(M->n + M1->n) + 5 limbs |
| of temporary storage (see mpn_matrix22_mul_itch). */ |
| void |
| mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1, |
| mp_ptr tp) |
| { |
| mp_size_t n; |
| |
| /* About the new size of M:s elements. Since M1's diagonal elements |
| are > 0, no element can decrease. The new elements are of size |
| M->n + M1->n, one limb more or less. The computation of the |
| matrix product produces elements of size M->n + M1->n + 1. But |
| the true size, after normalization, may be three limbs smaller. */ |
| |
| /* FIXME: Strassen multiplication gives only a small speedup. In FFT |
| multiplication range, this function could be sped up quite a lot |
| using invariance. */ |
| ASSERT (M->n + M1->n < M->alloc); |
| |
| ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1] |
| | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0); |
| |
| ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1] |
| | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0); |
| |
| mpn_matrix22_mul (M->p[0][0], M->p[0][1], |
| M->p[1][0], M->p[1][1], M->n, |
| M1->p[0][0], M1->p[0][1], |
| M1->p[1][0], M1->p[1][1], M1->n, tp); |
| |
| /* Index of last potentially non-zero limb, size is one greater. */ |
| n = M->n + M1->n; |
| |
| n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); |
| n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); |
| n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); |
| |
| ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0); |
| |
| M->n = n + 1; |
| } |
| |
| /* Multiplies the least significant p limbs of (a;b) by M^-1. |
| Temporary space needed: 2 * (p + M->n)*/ |
| mp_size_t |
| mpn_hgcd_matrix_adjust (struct hgcd_matrix *M, |
| mp_size_t n, mp_ptr ap, mp_ptr bp, |
| mp_size_t p, mp_ptr tp) |
| { |
| /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) |
| = (r11 a - r01 b; - r10 a + r00 b */ |
| |
| mp_ptr t0 = tp; |
| mp_ptr t1 = tp + p + M->n; |
| mp_limb_t ah, bh; |
| mp_limb_t cy; |
| |
| ASSERT (p + M->n < n); |
| |
| /* First compute the two values depending on a, before overwriting a */ |
| |
| if (M->n >= p) |
| { |
| mpn_mul (t0, M->p[1][1], M->n, ap, p); |
| mpn_mul (t1, M->p[1][0], M->n, ap, p); |
| } |
| else |
| { |
| mpn_mul (t0, ap, p, M->p[1][1], M->n); |
| mpn_mul (t1, ap, p, M->p[1][0], M->n); |
| } |
| |
| /* Update a */ |
| MPN_COPY (ap, t0, p); |
| ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); |
| |
| if (M->n >= p) |
| mpn_mul (t0, M->p[0][1], M->n, bp, p); |
| else |
| mpn_mul (t0, bp, p, M->p[0][1], M->n); |
| |
| cy = mpn_sub (ap, ap, n, t0, p + M->n); |
| ASSERT (cy <= ah); |
| ah -= cy; |
| |
| /* Update b */ |
| if (M->n >= p) |
| mpn_mul (t0, M->p[0][0], M->n, bp, p); |
| else |
| mpn_mul (t0, bp, p, M->p[0][0], M->n); |
| |
| MPN_COPY (bp, t0, p); |
| bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); |
| cy = mpn_sub (bp, bp, n, t1, p + M->n); |
| ASSERT (cy <= bh); |
| bh -= cy; |
| |
| if (ah > 0 || bh > 0) |
| { |
| ap[n] = ah; |
| bp[n] = bh; |
| n++; |
| } |
| else |
| { |
| /* The subtraction can reduce the size by at most one limb. */ |
| if (ap[n-1] == 0 && bp[n-1] == 0) |
| n--; |
| } |
| ASSERT (ap[n-1] > 0 || bp[n-1] > 0); |
| return n; |
| } |
| |
| /* Size analysis for hgcd: |
| |
| For the recursive calls, we have n1 <= ceil(n / 2). Then the |
| storage need is determined by the storage for the recursive call |
| computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1 |
| (after this, the storage needed for M1 can be recycled). |
| |
| Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1) |
| = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2, |
| and for the hgcd_matrix_mul, we may need 4 ceil(n/2) + 1. In total, |
| 4 * ceil(n/4) + 4 ceil(n/2) + 5 <= 12 ceil(n/4) + 5. |
| |
| For the recursive call, we need S(n1) = S(ceil(n/2)). |
| |
| S(n) <= 12*ceil(n/4) + 5 + S(ceil(n/2)) |
| <= 12*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 5k + S(ceil(n/2^k)) |
| <= 12*(2 ceil(n/4) + k) + 5k + S(n/2^k) |
| <= 24 ceil(n/4) + 17k + S(n/2^k) |
| |
| */ |
| |
| mp_size_t |
| mpn_hgcd_itch (mp_size_t n) |
| { |
| unsigned k; |
| int count; |
| mp_size_t nscaled; |
| |
| if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) |
| return MPN_HGCD_LEHMER_ITCH (n); |
| |
| /* Get the recursion depth. */ |
| nscaled = (n - 1) / (HGCD_THRESHOLD - 1); |
| count_leading_zeros (count, nscaled); |
| k = GMP_LIMB_BITS - count; |
| |
| return 24 * ((n+3) / 4) + 17 * k |
| + MPN_HGCD_LEHMER_ITCH (HGCD_THRESHOLD); |
| } |
| |
| /* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M |
| with elements of size at most (n+1)/2 - 1. Returns new size of a, |
| b, or zero if no reduction is possible. */ |
| |
| mp_size_t |
| mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n, |
| struct hgcd_matrix *M, mp_ptr tp) |
| { |
| mp_size_t s = n/2 + 1; |
| mp_size_t n2 = (3*n)/4 + 1; |
| |
| mp_size_t p, nn; |
| int success = 0; |
| |
| if (n <= s) |
| /* Happens when n <= 2, a fairly uninteresting case but exercised |
| by the random inputs of the testsuite. */ |
| return 0; |
| |
| ASSERT ((ap[n-1] | bp[n-1]) > 0); |
| |
| ASSERT ((n+1)/2 - 1 < M->alloc); |
| |
| if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) |
| return mpn_hgcd_lehmer (ap, bp, n, M, tp); |
| |
| p = n/2; |
| nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp); |
| if (nn > 0) |
| { |
| /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) |
| = 2 (n - 1) */ |
| n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); |
| success = 1; |
| } |
| while (n > n2) |
| { |
| /* Needs n + 1 storage */ |
| nn = hgcd_step (n, ap, bp, s, M, tp); |
| if (!nn) |
| return success ? n : 0; |
| n = nn; |
| success = 1; |
| } |
| |
| if (n > s + 2) |
| { |
| struct hgcd_matrix M1; |
| mp_size_t scratch; |
| |
| p = 2*s - n + 1; |
| scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); |
| |
| mpn_hgcd_matrix_init(&M1, n - p, tp); |
| nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch); |
| if (nn > 0) |
| { |
| /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ |
| ASSERT (M->n + 2 >= M1.n); |
| |
| /* Furthermore, assume M ends with a quotient (1, q; 0, 1), |
| then either q or q + 1 is a correct quotient, and M1 will |
| start with either (1, 0; 1, 1) or (2, 1; 1, 1). This |
| rules out the case that the size of M * M1 is much |
| smaller than the expected M->n + M1->n. */ |
| |
| ASSERT (M->n + M1.n < M->alloc); |
| |
| /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) |
| = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */ |
| n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); |
| /* Needs 4 ceil(n/2) + 1 */ |
| mpn_hgcd_matrix_mul (M, &M1, tp + scratch); |
| success = 1; |
| } |
| } |
| |
| /* This really is the base case */ |
| for (;;) |
| { |
| /* Needs s+3 < n */ |
| nn = hgcd_step (n, ap, bp, s, M, tp); |
| if (!nn) |
| return success ? n : 0; |
| |
| n = nn; |
| success = 1; |
| } |
| } |