Ticket #424: gcd1.patch
File gcd1.patch, 137.3 KB (added by , 13 years ago)
gmp-impl.h
diff -N -u -r gmp-4.2.1/gmp-impl.h gmp-4.2.1-patched/gmp-impl.h
--- gmp-4.2.1/gmp-impl.h
+++ gmp-4.2.1-patched/gmp-impl.h
@@ -2007,6 +2007,216 @@
 #endif
 
 
+/******************************************************************************
+*******************************************************************************
+
+  This next chunk is from the file gmp-impl.h-gcd-newlines in the development
+  gcd code, retrieved from GMP's website on 6-sep-2007.
+
+*******************************************************************************
+******************************************************************************/
+
+
+/* HGCD definitions */
+
+/* Limited by 2 + twice the bitsize of mp_size_t */
+#define QSTACK_MAX_QUOTIENTS 82
+
+/* Name mangling */
+#define qstack_itch __gmpn_qstack_itch
+#define qstack_init __gmpn_qstack_init
+#define qstack_reset __gmpn_qstack_reset
+#define qstack_rotate __gmpn_qstack_rotate
+
+#define mpn_hgcd2 __gmpn_hgcd2
+#define mpn_hgcd2_fix __gmpn_hgcd2_fix
+#define mpn_hgcd2_lehmer_step __gmpn_hgcd2_lehmer_step
+#define mpn_hgcd_max_recursion __gmpn_hgcd_max_recursion
+#define mpn_hgcd_init_itch __gmpn_hgcd_init_itch
+#define mpn_hgcd_init __gmpn_hgcd_init
+#define mpn_hgcd_lehmer_itch __gmpn_hgcd_lehmer_itch
+#define mpn_hgcd_lehmer __gmpn_hgcd_lehmer
+#define mpn_hgcd_itch __gmpn_hgcd_itch
+#define mpn_hgcd __gmpn_hgcd
+#define mpn_hgcd_equal __gmpn_hgcd_equal
+#define mpn_hgcd_fix __gmpn_hgcd_fix
+
+struct qstack
+{
+  /* Throughout the code we represent q = 1 with qsize = 0. */
+  mp_size_t size[QSTACK_MAX_QUOTIENTS];
+  mp_ptr limb;
+  mp_size_t limb_alloc;
+
+  /* Number of quotients to keep when we discard old quotients */
+  unsigned nkeep;
+
+  /* Top quotient is of size size[size_next-1], and starts at
+     limb+limb_next - size[size_next-1]. We use size_next == 0 for an
+     empty stack.*/
+  unsigned size_next;
+  mp_size_t limb_next;
+};
+
+mp_size_t
+qstack_itch __GMP_PROTO ((mp_size_t size));
+
+void
+qstack_init __GMP_PROTO ((struct qstack *stack,
+                          mp_size_t asize,
+                          mp_limb_t *limbs, mp_size_t alloc));
+
+void
+qstack_reset __GMP_PROTO ((struct qstack *stack,
+                           mp_size_t asize));
+
+void
+qstack_rotate __GMP_PROTO ((struct qstack *stack,
+                            mp_size_t size));
+
+#if WANT_ASSERT
+void
+__gmpn_qstack_sanity __GMP_PROTO ((struct qstack *stack));
+#define ASSERT_QSTACK __gmpn_qstack_sanity
+#else
+#define ASSERT_QSTACK(stack)
+#endif
+
+struct hgcd2_row
+{
+  /* r = (-)u a + (-)v b */
+  mp_limb_t u;
+  mp_limb_t v;
+};
+
+struct hgcd2
+{
+  /* Sign of the first row, sign >= 0 implies that u >= 0 and v <= 0,
+     sign < 0 implies u <= 0, v >= 0 */
+  int sign;
+  struct hgcd2_row row[4];
+};
+
+int
+mpn_hgcd2 __GMP_PROTO ((struct hgcd2 *hgcd,
+                        mp_limb_t ah, mp_limb_t al,
+                        mp_limb_t bh, mp_limb_t bl,
+                        struct qstack *quotients));
+
+mp_size_t
+mpn_hgcd2_fix __GMP_PROTO ((mp_ptr rp, mp_size_t ralloc,
+                            int sign,
+                            mp_limb_t u, mp_srcptr ap, mp_size_t asize,
+                            mp_limb_t v, mp_srcptr bp, mp_size_t bsize));
+
+int
+mpn_hgcd2_lehmer_step __GMP_PROTO ((struct hgcd2 *hgcd,
+                                    mp_srcptr ap, mp_size_t asize,
+                                    mp_srcptr bp, mp_size_t bsize,
+                                    struct qstack *quotients));
+
+unsigned
+mpn_hgcd_max_recursion __GMP_PROTO ((mp_size_t n));
+
+struct hgcd_row
+{
+  /* [rp, rsize] should always be normalized. */
+  mp_ptr rp; mp_size_t rsize;
+  mp_ptr uvp[2];
+};
+
+struct hgcd
+{
+  int sign;
+  /* Space allocated for the uv entries, for sanity checking */
+  mp_size_t alloc;
+  /* Size of the largest u,v entry, usually row[3].uvp[1]. This
+     element should be normalized. Smaller elements must be zero
+     padded, and all unused limbs (i.e. between size and alloc) must
+     be zero. */
+  mp_size_t size;
+  struct hgcd_row row[4];
+};
+
+mp_size_t
+mpn_hgcd_init_itch __GMP_PROTO ((mp_size_t size));
+
+void
+mpn_hgcd_init __GMP_PROTO ((struct hgcd *hgcd,
+                            mp_size_t asize,
+                            mp_limb_t *limbs));
+
+mp_size_t
+mpn_hgcd_lehmer_itch __GMP_PROTO ((mp_size_t asize));
+
+int
+mpn_hgcd_lehmer __GMP_PROTO ((struct hgcd *hgcd,
+                              mp_srcptr ap, mp_size_t asize,
+                              mp_srcptr bp, mp_size_t bsize,
+                              struct qstack *quotients,
+                              mp_ptr tp, mp_size_t talloc));
+
+mp_size_t
+mpn_hgcd_itch __GMP_PROTO ((mp_size_t size));
+
+int
+mpn_hgcd __GMP_PROTO ((struct hgcd *hgcd,
+                       mp_srcptr ap, mp_size_t asize,
+                       mp_srcptr bp, mp_size_t bsize,
+                       struct qstack *quotients,
+                       mp_ptr tp, mp_size_t talloc));
+
+#if WANT_ASSERT
+void
+__gmpn_hgcd_sanity __GMP_PROTO ((const struct hgcd *hgcd,
+                                 mp_srcptr ap, mp_size_t asize,
+                                 mp_srcptr bp, mp_size_t bsize,
+                                 unsigned start, unsigned end));
+#define ASSERT_HGCD __gmpn_hgcd_sanity
+#else
+#define ASSERT_HGCD(hgcd, ap, asize, bp, bsize, start, end)
+#endif
+
+int
+mpn_hgcd_equal __GMP_PROTO ((const struct hgcd *A, const struct hgcd *B));
+
+mp_size_t
+mpn_hgcd_fix __GMP_PROTO ((mp_size_t k,
+                           mp_ptr rp, mp_size_t ralloc,
+                           int sign, mp_size_t uvsize,
+                           const struct hgcd_row *s,
+                           mp_srcptr ap, mp_srcptr bp,
+                           mp_ptr tp, mp_size_t talloc));
+
+#ifndef HGCD_SCHOENHAGE_THRESHOLD
+#define HGCD_SCHOENHAGE_THRESHOLD 150
+#endif
+
+#if 0
+#ifndef GCD_LEHMER_THRESHOLD
+#define GCD_LEHMER_THRESHOLD 200
+#endif
+#endif
+
+#ifndef GCD_SCHOENHAGE_THRESHOLD
+#define GCD_SCHOENHAGE_THRESHOLD 1000
+#endif
+
+#ifndef GCDEXT_SCHOENHAGE_THRESHOLD
+#define GCDEXT_SCHOENHAGE_THRESHOLD 600
+#endif
+
+
+/******************************************************************************
+*******************************************************************************
+
+  End of gcd chunk
+
+*******************************************************************************
+******************************************************************************/
+
+
+
 /* Structure for conversion between internal binary format and
    strings in base 2..36.  */
 struct bases
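Note on the interface above: the qstack and hgcd objects follow GMP's usual itch/init convention. The caller asks the *_itch function how many limbs of scratch a given operand size needs, carves that much out of one workspace, and hands the memory to the matching *_init function. The sketch below mirrors how gcd_schoenhage and gcdext_schoenhage later in this patch set up their workspace; it assumes it is compiled inside the patched tree (struct hgcd and struct qstack exist only in the patched gmp-impl.h), and the helper name setup_hgcd_workspace is made up for illustration.

#include "gmp.h"
#include "gmp-impl.h"   /* struct hgcd, struct qstack -- patched tree only */

/* Hypothetical helper: carve hgcd and qstack storage out of a single
   scratch buffer of *talloc limbs, for operands of at most asize limbs.
   Mirrors the setup code in gcd_schoenhage/gcdext_schoenhage. */
static mp_ptr
setup_hgcd_workspace (struct hgcd *hgcd, struct qstack *quotients,
                      mp_size_t asize, mp_ptr tp, mp_size_t *talloc)
{
  mp_size_t hsize = (asize + 1) / 2;           /* hgcd works on the top half */
  mp_size_t hgcd_limbs = mpn_hgcd_init_itch (hsize);
  mp_size_t qstack_limbs = qstack_itch (hsize);

  ASSERT (hgcd_limbs + qstack_limbs <= *talloc);

  mpn_hgcd_init (hgcd, hsize, tp);             /* consumes hgcd_limbs */
  tp += hgcd_limbs; *talloc -= hgcd_limbs;

  qstack_init (quotients, hsize, tp, qstack_limbs);
  tp += qstack_limbs; *talloc -= qstack_limbs;

  return tp;   /* what remains is passed on to mpn_hgcd itself */
}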
mpn/generic/gcd.c
diff -N -u -r gmp-4.2.1/mpn/generic/gcd.c gmp-4.2.1-patched/mpn/generic/gcd.c
old new 1 1 /* mpn/gcd.c: mpn_gcd for gcd of two odd integers. 2 2 3 Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002 Free4 Software Foundation, Inc.3 Copyright 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000, 2001, 2002, 2003, 4 2004 Free Software Foundation, Inc. 5 5 6 6 This file is part of the GNU MP Library. 7 7 … … 16 16 License for more details. 17 17 18 18 You should have received a copy of the GNU Lesser General Public License 19 along with the GNU MP Library; see the file COPYING.LIB. If not, write 20 t o the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,21 Boston, MA 02110-1301, USA. */19 along with the GNU MP Library; see the file COPYING.LIB. If not, write to 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, 21 MA 02111-1307, USA. */ 22 22 23 23 /* Integer greatest common divisor of two unsigned integers, using 24 24 the accelerated algorithm (see reference below). … … 44 44 K. Weber, The accelerated integer GCD algorithm, ACM Transactions on 45 45 Mathematical Software, v. 21 (March), 1995, pp. 111-122. */ 46 46 47 #include <stdio.h> /* for NULL */ 48 47 49 #include "gmp.h" 48 50 #include "gmp-impl.h" 49 51 #include "longlong.h" 50 52 53 51 54 /* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated 52 55 algorithm is used, otherwise the binary algorithm is used. This may be 53 56 adjusted for different architectures. */ … … 124 127 125 128 #if HAVE_NATIVE_mpn_gcd_finda 126 129 #define find_a(cp) mpn_gcd_finda (cp) 127 128 130 #else 129 131 static 130 132 #if ! defined (__i386__) … … 181 183 } 182 184 #endif 183 185 184 185 mp_size_t186 mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)186 /* v must be odd */ 187 static mp_size_t 188 gcd_binary_odd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) 187 189 { 188 190 mp_ptr orig_vp = vp; 189 191 mp_size_t orig_vsize = vsize; … … 440 442 TMP_FREE; 441 443 return vsize; 442 444 } 445 446 #define EVEN_P(x) (((x) & 1) == 0) 447 448 /* Allows an even v */ 449 static mp_size_t 450 gcd_binary (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) 451 { 452 mp_size_t zero_words = 0; 453 mp_size_t gsize; 454 unsigned shift = 0; 455 456 ASSERT (usize > 0); 457 ASSERT (vsize > 0); 458 459 if (up[0] == 0 && vp[0] == 0) 460 { 461 do 462 gp[zero_words++] = 0; 463 while (up[zero_words] == 0 && vp[zero_words] == 0); 464 465 up += zero_words; usize -= zero_words; 466 vp += zero_words; vsize -= zero_words; 467 gp += zero_words; 468 } 469 470 /* Now u and v can have a common power of two < 2^GMP_NUMB_BITS */ 471 if (up[0] == 0) 472 { 473 ASSERT (vp[0] != 0); 474 if (EVEN_P (vp[0])) 475 { 476 count_trailing_zeros (shift, vp[0]); 477 ASSERT (shift > 0); 478 ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, shift)); 479 if (vp[vsize - 1] == 0) 480 vsize--; 481 } 482 } 483 else if (vp[0] == 0) 484 { 485 if (EVEN_P (up[0])) 486 { 487 count_trailing_zeros (shift, up[0]); 488 ASSERT (shift > 0); 489 } 490 while (vp[0] == 0) 491 { 492 vp++; 493 vsize--; 494 } 495 496 if (EVEN_P (vp[0])) 497 { 498 unsigned vcount; 499 500 count_trailing_zeros (vcount, vp[0]); 501 ASSERT (vcount > 0); 502 ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); 503 if (vp[vsize - 1] == 0) 504 vsize--; 505 } 506 } 507 else if (EVEN_P (vp[0])) 508 { 509 unsigned vcount; 510 count_trailing_zeros (vcount, vp[0]); 511 ASSERT (vcount > 0); 512 ASSERT_NOCARRY (mpn_rshift (vp, vp, vsize, vcount)); 513 if (vp[vsize - 1] == 0) 514 vsize--; 515 516 if (EVEN_P 
(up[0])) 517 { 518 unsigned ucount; 519 count_trailing_zeros (ucount, up[0]); 520 ASSERT (ucount > 0); 521 shift = MIN (ucount, vcount); 522 } 523 } 524 525 gsize = gcd_binary_odd (gp, up, usize, vp, vsize); 526 if (shift) 527 { 528 mp_limb_t cy = mpn_lshift (gp, gp, gsize, shift); 529 if (cy) 530 gp[gsize++] = cy; 531 } 532 return gsize + zero_words; 533 } 534 535 #define MPN_LEQ_P(ap, asize, bp, bsize) \ 536 ((asize) < (bsize) || ((asize) == (bsize) \ 537 && mpn_cmp ((ap), (bp), (asize)) <= 0)) 538 539 /* Sets (a, b, c, d) <-- (c, d, a, b) */ 540 #define NHGCD_SWAP4_2(row) \ 541 do { \ 542 struct hgcd_row __nhgcd_swap4_2_tmp; \ 543 __nhgcd_swap4_2_tmp = row[0]; \ 544 row[0] = row[2]; \ 545 row[2] = __nhgcd_swap4_2_tmp; \ 546 __nhgcd_swap4_2_tmp = row[1]; \ 547 row[1] = row[3]; \ 548 row[3] = __nhgcd_swap4_2_tmp; \ 549 } while (0) 550 551 /* Sets (a, b, c) <-- (b, c, a) */ 552 #define NHGCD_SWAP3_LEFT(row) \ 553 do { \ 554 struct hgcd_row __nhgcd_swap4_left_tmp; \ 555 __nhgcd_swap4_left_tmp = row[0]; \ 556 row[0] = row[1]; \ 557 row[1] = row[2]; \ 558 row[2] = __nhgcd_swap4_left_tmp; \ 559 } while (0) 560 561 static mp_size_t 562 hgcd_tdiv (mp_ptr qp, 563 mp_ptr rp, mp_size_t *rsizep, 564 mp_srcptr ap, mp_size_t asize, 565 mp_srcptr bp, mp_size_t bsize) 566 { 567 mp_size_t qsize; 568 mp_size_t rsize; 569 570 mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); 571 572 rsize = bsize; 573 MPN_NORMALIZE (rp, rsize); 574 *rsizep = rsize; 575 576 qsize = asize - bsize + 1; 577 qsize -= (qp[qsize - 1] == 0); 578 579 if (qsize == 1 && qp[0] == 1) 580 return 0; 581 582 return qsize; 583 } 584 585 586 #if 0 587 #define GCD_LEHMER_ITCH(asize) (5*((asize) + 1)) 588 589 static mp_size_t 590 gcd_lehmer (mp_ptr gp, mp_srcptr ap, mp_size_t asize, 591 mp_srcptr bp, mp_size_t bsize, 592 mp_ptr tp, mp_size_t talloc) 593 { 594 struct hgcd_row r[4]; 595 mp_ptr qp; 596 mp_size_t qsize; 597 mp_size_t ralloc = asize + 1; 598 599 ASSERT (asize >= bsize); 600 ASSERT (bsize > 0); 601 602 #if 0 603 if (BELOW_THRESHOLD (asize, MPN_GCD_LEHMER_THRESHOLD)) 604 { 605 ASSERT (asize + bsize + 2 <= talloc); 606 607 MPN_COPY (tp, ap, asize); 608 MPN_COPY (tp + asize + 1, bp, bsize); 609 return nhgcd_gcd_binary (gp, tp, asize, tp + asize + 1, bsize); 610 } 611 #endif 612 613 ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); 614 ASSERT (5 * asize + 4 <= talloc); 615 616 r[0].rp = tp; tp += ralloc; talloc -= ralloc; 617 r[1].rp = tp; tp += ralloc; talloc -= ralloc; 618 r[2].rp = tp; tp += ralloc; talloc -= ralloc; 619 r[3].rp = tp; tp += ralloc; talloc -= ralloc; 620 qp = tp; tp += asize; talloc -= asize; 621 622 MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; 623 MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; 624 625 #if 0 626 /* u and v fields aren't used, but zero them out so that we can call 627 trace_nhgcd_row */ 628 r[0].uvp[0] = r[0].uvp[1] = NULL; 629 r[1].uvp[0] = r[1].uvp[1] = NULL; 630 r[2].uvp[0] = r[2].uvp[1] = NULL; 631 r[3].uvp[0] = r[3].uvp[1] = NULL; 632 #endif 633 634 while (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD) && r[1].rsize > 0) 635 { 636 struct hgcd2 hgcd; 637 int res = mpn_hgcd2_lehmer_step (&hgcd, 638 r[0].rp, r[0].rsize, 639 r[1].rp, r[1].rsize, 640 NULL); 641 642 if (!res || (res == 2 && hgcd.row[0].v == 0)) 643 { 644 qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, 645 r[0].rp, r[0].rsize, 646 r[1].rp, r[1].rsize); 647 NHGCD_SWAP3_LEFT (r); 648 } 649 else 650 { 651 const struct hgcd2_row *s = hgcd.row + (res - 2); 652 int sign = hgcd.sign; 653 if (res == 3) 654 sign = ~sign; 655 656 /* s[0] and 
s[1] correct. */ 657 r[2].rsize 658 = mpn_hgcd2_fix (r[2].rp, ralloc, 659 sign, 660 s[0].u, r[0].rp, r[0].rsize, 661 s[0].v, r[1].rp, r[1].rsize); 662 663 r[3].rsize 664 = mpn_hgcd2_fix (r[3].rp, ralloc, 665 ~sign, 666 s[1].u, r[0].rp, r[0].rsize, 667 s[1].v, r[1].rp, r[1].rsize); 668 669 NHGCD_SWAP4_2 (r); 670 } 671 } 672 673 if (r[1].rsize == 0) 674 { 675 MPN_COPY (gp, r[0].rp, r[0].rsize); 676 return r[0].rsize; 677 } 678 679 return gcd_binary (gp, r[0].rp, r[0].rsize, r[1].rp, r[1].rsize); 680 } 681 #endif 682 683 static mp_size_t 684 gcd_schoenhage_itch (mp_size_t asize) 685 { 686 /* Size for hgcd calls */ 687 mp_size_t ralloc = asize + 1; 688 mp_size_t hgcd_size = (asize + 1) / 2; 689 return (4 * ralloc /* Remainder storage */ 690 + mpn_hgcd_init_itch (hgcd_size) /* hgcd storage */ 691 + qstack_itch (hgcd_size) 692 + mpn_hgcd_itch (hgcd_size) /* nhgcd call */ 693 + 1+ 3 * asize / 4); /* hgcd_fix */ 694 } 695 696 static mp_size_t 697 gcd_schoenhage (mp_ptr gp, mp_srcptr ap, mp_size_t asize, 698 mp_srcptr bp, mp_size_t bsize, 699 mp_ptr tp, mp_size_t talloc) 700 { 701 mp_size_t scratch; 702 struct hgcd hgcd; 703 struct qstack quotients; 704 struct hgcd_row r[4]; 705 706 mp_size_t ralloc = asize + 1; 707 708 ASSERT (asize >= bsize); 709 ASSERT (bsize > 0); 710 711 ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); 712 713 ASSERT (4 * ralloc <= talloc); 714 tp += ralloc; talloc -= ralloc; 715 r[0].rp = tp; tp += ralloc; talloc -= ralloc; 716 r[1].rp = tp; tp += ralloc; talloc -= ralloc; 717 r[2].rp = tp; tp += ralloc; talloc -= ralloc; 718 r[3].rp = tp; tp += ralloc; talloc -= ralloc; 719 720 MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; 721 MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; 722 723 #if 0 724 /* We don't use the u and v fields, but zero them out so that we can 725 call trace_nhgcd_row while debugging. 
*/ 726 r[0].uvp[0] = r[0].uvp[1] = NULL; 727 r[1].uvp[0] = r[1].uvp[1] = NULL; 728 r[2].uvp[0] = r[2].uvp[1] = NULL; 729 r[3].uvp[0] = r[3].uvp[1] = NULL; 730 #endif 731 732 scratch = mpn_hgcd_init_itch ((asize + 1)/2); 733 ASSERT (scratch <= talloc); 734 mpn_hgcd_init (&hgcd, (asize + 1)/2, tp); 735 tp += scratch; talloc -= scratch; 736 737 { 738 mp_size_t nlimbs = qstack_itch ((asize + 1)/2); 739 740 ASSERT (nlimbs <= talloc); 741 742 qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); 743 744 tp += nlimbs; 745 talloc -= nlimbs; 746 } 747 748 while (ABOVE_THRESHOLD (r[0].rsize, GCD_SCHOENHAGE_THRESHOLD) 749 && r[1].rsize > 0) 750 { 751 mp_size_t k = r[0].rsize / 2; 752 int res; 753 754 #if 0 755 trace ("nhgcd_gcd_schoenhage\n"); 756 trace_nhgcd_row (r); 757 trace_nhgcd_row (r + 1); 758 #endif 759 if (r[1].rsize <= k) 760 goto euclid; 761 762 qstack_reset ("ients, r[0].rsize - k); 763 764 res = mpn_hgcd (&hgcd, 765 r[0].rp + k, r[0].rsize - k, 766 r[1].rp + k, r[1].rsize - k, 767 "ients, 768 tp, talloc); 769 770 if (res == 0 || res == 1) 771 { 772 euclid: 773 ASSERT (r[0].rsize - r[1].rsize + 1 <= talloc); 774 hgcd_tdiv (tp, r[2].rp, &r[2].rsize, 775 r[0].rp, r[0].rsize, 776 r[1].rp, r[1].rsize); 777 778 NHGCD_SWAP3_LEFT (r); 779 } 780 else 781 { 782 const struct hgcd_row *s = hgcd.row + (res - 2); 783 int sign = hgcd.sign; 784 if (res == 3) 785 sign = ~sign; 786 787 /* s[0] and s[1] are correct */ 788 r[2].rsize 789 = mpn_hgcd_fix (k, r[2].rp, ralloc, 790 sign, hgcd.size, s, 791 r[0].rp, r[1].rp, 792 tp, talloc); 793 794 r[3].rsize 795 = mpn_hgcd_fix (k, r[3].rp, ralloc, 796 ~sign, hgcd.size, s+1, 797 r[0].rp, r[1].rp, 798 tp, talloc); 799 800 NHGCD_SWAP4_2 (r); 801 } 802 } 803 804 #if 0 805 trace ("nhgcd_gcd_schoenhage after loop\n"); 806 trace_nhgcd_row (r); 807 trace_nhgcd_row (r + 1); 808 #endif 809 810 if (r[1].rsize == 0) 811 { 812 MPN_COPY (gp, r[0].rp, r[0].rsize); 813 return r[0].rsize; 814 } 815 #if 0 816 else if (ABOVE_THRESHOLD (r[0].rsize, GCD_LEHMER_THRESHOLD)) 817 return gcd_lehmer (gp, 818 r[0].rp, r[0].rsize, 819 r[1].rp, r[1].rsize, 820 tp, talloc); 821 #endif 822 else 823 return gcd_binary (gp, 824 r[0].rp, r[0].rsize, 825 r[1].rp, r[1].rsize); 826 } 827 828 mp_size_t 829 mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize) 830 { 831 if (BELOW_THRESHOLD (vsize, GCD_SCHOENHAGE_THRESHOLD)) 832 { 833 if (usize > vsize + 1) 834 { 835 /* The size difference between the two operands is more than one 836 limb. Do an initial division, i.e., compute gcd(U mod V, V). */ 837 /* FIXME: Allocates O(usize) memory on stack. The planned mpn_div_r 838 will resolve that. */ 839 mp_ptr tp, qp, rp; 840 mp_size_t rsize, gsize; 841 int cnt; 842 TMP_DECL; 843 844 TMP_MARK; 845 tp = TMP_ALLOC_LIMBS (usize + 1); 846 qp = tp + vsize; 847 rp = tp; 848 849 mpn_tdiv_qr (qp, rp, 0L, up, usize, vp, vsize); 850 851 /* Remove any low zero bits from the remainder. */ 852 for (rsize = vsize; ; rsize--) 853 { 854 if (rsize == 0) 855 { 856 /* Remainder is zero, gcd(U, V) = V. 
*/ 857 MPN_COPY (gp, vp, vsize); 858 return vsize; 859 } 860 if (rp[0] != 0) 861 break; 862 rp++; 863 } 864 if ((rp[0] & 1) == 0) 865 { 866 count_trailing_zeros (cnt, rp[0]); 867 mpn_rshift (rp, rp, rsize, cnt); 868 } 869 MPN_NORMALIZE_NOT_ZERO (rp, rsize); 870 871 gsize = gcd_binary_odd (gp, vp, vsize, rp, rsize); 872 TMP_FREE; 873 return gsize; 874 } 875 else 876 return gcd_binary_odd (gp, up, usize, vp, vsize); 877 } 878 879 /* The algorithms below require U >= V, while mpn_gcd is long documented as 880 requiring only that the position of U's msb >= V's msb. */ 881 if (usize == vsize && mpn_cmp (up, vp, usize) < 0) 882 MP_PTR_SWAP (up, vp); 883 884 #if 0 885 if (BELOW_THRESHOLD (usize, GCD_SCHOENHAGE_THRESHOLD)) 886 { 887 mp_size_t scratch; 888 mp_ptr tp; 889 mp_size_t gsize; 890 TMP_DECL; 891 892 TMP_MARK; 893 894 scratch = GCD_LEHMER_ITCH (usize); 895 tp = TMP_ALLOC_LIMBS (scratch); 896 897 gsize = gcd_lehmer (gp, up, usize, vp, vsize, tp, scratch); 898 TMP_FREE; 899 return gsize; 900 } 901 else 902 #endif 903 { 904 mp_size_t scratch; 905 mp_ptr tp; 906 mp_size_t gsize; 907 908 scratch = gcd_schoenhage_itch (usize); 909 tp = __GMP_ALLOCATE_FUNC_LIMBS (scratch); 910 911 gsize = gcd_schoenhage (gp, up, usize, vp, vsize, tp, scratch); 912 __GMP_FREE_FUNC_LIMBS (tp, scratch); 913 return gsize; 914 } 915 } -
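For readers new to this file: gcd_binary_odd above is the multi-limb form of the classical binary GCD. It keeps v odd, strips trailing zero bits from u, and replaces the larger operand by the (even) difference of the two. A single-limb sketch of the same recurrence, plain C and not part of the patch, is:

/* Single-limb illustration of the recurrence used by gcd_binary_odd:
   v stays odd, trailing zero bits are stripped from u, and the larger
   value is replaced by the difference.  Precondition: v odd, u > 0. */
static unsigned long
gcd_binary_odd_1 (unsigned long u, unsigned long v)
{
  while (u != 0)
    {
      while ((u & 1) == 0)
        u >>= 1;                /* like the mpn_rshift calls above */
      if (u < v)
        {
          unsigned long t = u;  /* keep the smaller (odd) value in v */
          u = v;
          v = t;
        }
      u -= v;                   /* difference of two odd numbers is even */
    }
  return v;                     /* gcd of the original inputs */
}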
mpn/generic/gcdext.c
diff -N -u -r gmp-4.2.1/mpn/generic/gcdext.c gmp-4.2.1-patched/mpn/generic/gcdext.c
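The gcdext code added below maintains the Bezout identity g = u*a - v*b throughout (see the comments around gcdext_1_odd and gcdext_lehmer). The same identity is what the public mpz_gcdext exposes, so a small standalone sanity check of a patched build, using only the documented mpz interface and none of the internal functions added by this patch (test values chosen arbitrarily), could look like this:

#include <assert.h>
#include <gmp.h>

/* Verify g = s*a + t*b, the identity the patched gcdext code maintains
   internally as g = u*a - v*b with an explicit sign. */
int
main (void)
{
  mpz_t a, b, g, s, t, chk;
  mpz_init (a); mpz_init (b); mpz_init (g);
  mpz_init (s); mpz_init (t); mpz_init (chk);

  mpz_set_str (a, "123456789012345678901234567890", 10);
  mpz_set_str (b, "987654321098765432109876543210", 10);

  mpz_gcdext (g, s, t, a, b);      /* g = s*a + t*b */

  mpz_mul (chk, s, a);
  mpz_addmul (chk, t, b);
  assert (mpz_cmp (chk, g) == 0);

  gmp_printf ("gcd = %Zd\n", g);

  mpz_clear (a); mpz_clear (b); mpz_clear (g);
  mpz_clear (s); mpz_clear (t); mpz_clear (chk);
  return 0;
}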
old new 1 1 /* mpn_gcdext -- Extended Greatest Common Divisor. 2 2 3 Copyright 1996, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. 3 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, 4 Inc. 4 5 5 6 This file is part of the GNU MP Library. 6 7 7 8 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of the GNU LesserGeneral Public License as published by9 it under the terms of the GNU General Public License as published by 9 10 the Free Software Foundation; either version 2.1 of the License, or (at your 10 11 option) any later version. 11 12 12 13 The GNU MP Library is distributed in the hope that it will be useful, but 13 14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU LesserGeneral Public15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15 16 License for more details. 16 17 17 You should have received a copy of the GNU Lesser General Public License 18 along with the GNU MP Library; see the file COPYING.LIB. If not, write 19 to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 20 Boston, MA 02110-1301, USA. */ 18 You should have received a copy of the GNU General Public License 19 along with the GNU MP Library; see the file COPYING. If not, write to 20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, 21 MA 02111-1307, USA. */ 22 23 #define WANT_TRACE 0 24 25 /* Default to binary gcdext_1, since it is best on most current machines. 26 We should teach tuneup to choose the right gcdext_1. */ 27 #define GCDEXT_1_USE_BINARY 1 28 29 #if WANT_TRACE 30 # include <stdio.h> 31 # include <stdarg.h> 32 #endif 21 33 22 34 #include "gmp.h" 23 35 #include "gmp-impl.h" 24 36 #include "longlong.h" 25 37 26 #ifndef GCDEXT_THRESHOLD27 # define GCDEXT_THRESHOLD 1738 #ifndef NULL 39 # define NULL ((void *) 0) 28 40 #endif 29 41 30 #ifndef EXTEND 31 #define EXTEND 1 42 #if WANT_TRACE 43 static void 44 trace (const char *format, ...) 45 { 46 va_list args; 47 va_start (args, format); 48 gmp_vfprintf (stderr, format, args); 49 va_end (args); 50 } 32 51 #endif 33 52 34 #if STAT 35 int arr[GMP_LIMB_BITS + 1]; 36 #endif 53 /* Comparison of _normalized_ numbers. */ 54 55 #define MPN_EQUAL_P(ap, asize, bp, bsize) \ 56 ((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) 37 57 58 #define MPN_LEQ_P(ap, asize, bp, bsize) \ 59 ((asize) < (bsize) || ((asize) == (bsize) \ 60 && mpn_cmp ((ap), (bp), (asize)) <= 0)) 38 61 39 /* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE) 62 /* Returns g, u and v such that g = u A - v B. There are three 63 different cases for the result: 40 64 41 Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the 42 greatest common divisor at GP (unless it is 0), and the first cofactor at 43 SP. Write the size of the cofactor through the pointer SSIZE. Return the 44 size of the value at GP. Note that SP might be a negative number; this is 45 denoted by storing the negative of the size through SSIZE. 46 47 {UP,USIZE} and {VP,VSIZE} are both clobbered. 48 49 The space allocation for all four areas needs to be USIZE+1. 50 51 Preconditions: 1) U >= V. 52 2) V > 0. */ 53 54 /* We use Lehmer's algorithm. The idea is to extract the most significant 55 bits of the operands, and compute the continued fraction for them. We then 56 apply the gathered cofactors to the full operands. 
57 58 Idea 1: After we have performed a full division, don't shift operands back, 59 but instead account for the extra factors-of-2 thus introduced. 60 Idea 2: Simple generalization to use divide-and-conquer would give us an 61 algorithm that runs faster than O(n^2). 62 Idea 3: The input numbers need less space as the computation progresses, 63 while the s0 and s1 variables need more space. To save memory, we 64 could make them share space, and have the latter variables grow 65 into the former. 66 Idea 4: We should not do double-limb arithmetic from the start. Instead, 67 do things in single-limb arithmetic until the quotients differ, 68 and then switch to double-limb arithmetic. */ 65 g = u A - v B, 0 < u < b, 0 < v < a 66 g = A u = 1, v = 0 67 g = B u = B, v = A - 1 69 68 69 We always return with 0 < u <= b, 0 <= v < a. 70 */ 71 #if GCDEXT_1_USE_BINARY 70 72 71 /* One-limb division optimized for small quotients. */72 73 static mp_limb_t 73 div1 (mp_limb_t n0, mp_limb_t d0)74 gcdext_1_odd (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) 74 75 { 75 if ((mp_limb_signed_t) n0 < 0) 76 mp_limb_t u0; 77 mp_limb_t v0; 78 mp_limb_t v1; 79 mp_limb_t u1; 80 81 mp_limb_t B = b; 82 mp_limb_t A = a; 83 84 /* Through out this function maintain 85 86 a = u0 A - v0 B 87 b = u1 A - v1 B 88 89 where A and B are odd. */ 90 91 u0 = 1; v0 = 0; 92 u1 = b; v1 = a-1; 93 94 if (A == 1) 76 95 { 77 mp_limb_t q; 78 int cnt; 79 for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++) 96 *up = u0; *vp = v0; 97 return 1; 98 } 99 else if (B == 1) 100 { 101 *up = u1; *vp = v1; 102 return 1; 103 } 104 105 while (a != b) 106 { 107 mp_limb_t mask; 108 109 ASSERT (a % 2 == 1); 110 ASSERT (b % 2 == 1); 111 112 ASSERT (0 < u0); ASSERT (u0 <= B); 113 ASSERT (0 < u1); ASSERT (u1 <= B); 114 115 ASSERT (0 <= v0); ASSERT (v0 < A); 116 ASSERT (0 <= v1); ASSERT (v1 < A); 117 118 if (a > b) 80 119 { 81 d0 = d0 << 1; 120 MP_LIMB_T_SWAP (a, b); 121 MP_LIMB_T_SWAP (u0, u1); 122 MP_LIMB_T_SWAP (v0, v1); 123 } 124 125 ASSERT (a < b); 126 127 /* Makes b even */ 128 b -= a; 129 130 mask = - (mp_limb_t) (u1 < u0); 131 u1 += B & mask; 132 v1 += A & mask; 133 u1 -= u0; 134 v1 -= v0; 135 136 ASSERT (b % 2 == 0); 137 138 do 139 { 140 /* As b = u1 A + v1 B is even, while A and B are odd, 141 either both or none of u1, v1 is even */ 142 143 ASSERT (u1 % 2 == v1 % 2); 144 145 mask = -(u1 & 1); 146 u1 = u1 / 2 + ((B / 2) & mask) - mask; 147 v1 = v1 / 2 + ((A / 2) & mask) - mask; 148 149 b /= 2; 82 150 } 151 while (b % 2 == 0); 152 } 153 154 /* Now g = a = b */ 155 ASSERT (a == b); 156 ASSERT (u1 <= B); 157 ASSERT (v1 < A); 158 159 ASSERT (A % a == 0); 160 ASSERT (B % a == 0); 161 ASSERT (u0 % (B/a) == u1 % (B/a)); 162 ASSERT (v0 % (A/a) == v1 % (A/a)); 163 164 *up = u0; *vp = v0; 165 166 return a; 167 } 168 169 static mp_limb_t 170 gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) 171 { 172 unsigned shift = 0; 173 mp_limb_t g; 174 mp_limb_t u; 175 mp_limb_t v; 176 177 /* We use unsigned values in the range 0, ... B - 1. As the values 178 are uniquely determined only modulo B, we can add B at will, to 179 get numbers in range or flip the least significant bit. 
*/ 180 /* Deal with powers of two */ 181 while ((a | b) % 2 == 0) 182 { 183 a /= 2; b /= 2; shift++; 184 } 83 185 84 q = 0; 85 while (cnt) 186 if (b % 2 == 0) 187 { 188 unsigned k = 0; 189 190 do { 191 b /= 2; k++; 192 } while (b % 2 == 0); 193 194 g = gcdext_1_odd (&u, &v, a, b); 195 196 while (k--) 86 197 { 87 q <<= 1; 88 if (n0 >= d0) 198 /* We have g = u a + v b, and need to construct 199 g = u'a + v'(2b). 200 201 If v is even, we can just set u' = u, v' = v/2 202 If v is odd, we can set v' = (v + a)/2, u' = u + b 203 */ 204 205 if (v % 2 == 0) 206 v /= 2; 207 else 89 208 { 90 n0 = n0 - d0;91 q |=1;209 u = u + b; 210 v = v/2 + a/2 + 1; 92 211 } 93 d0 = d0 >> 1; 94 cnt--; 212 b *= 2; 95 213 } 214 } 215 else if (a % 2 == 0) 216 { 217 unsigned k = 0; 218 219 do { 220 a /= 2; k++; 221 } while (a % 2 == 0); 222 223 g = gcdext_1_odd (&u, &v, a, b); 96 224 97 return q; 225 while (k--) 226 { 227 /* We have g = u a + v b, and need to construct 228 g = u'(2a) + v'b. 229 230 If u is even, we can just set u' = u/2, v' = v. 231 If u is odd, we can set u' = (u + b)/2 232 */ 233 234 if (u % 2 == 0) 235 u /= 2; 236 else 237 { 238 u = u/2 + b/2 + 1; 239 v = v + a; 240 } 241 a *= 2; 242 } 98 243 } 99 244 else 245 /* Ok, both are odd */ 246 g = gcdext_1_odd (&u, &v, a, b); 247 248 *up = u; 249 *vp = v; 250 251 return g << shift; 252 } 253 254 #else /* ! GCDEXT_1_USE_BINARY */ 255 static mp_limb_t 256 gcdext_1_u (mp_limb_t *up, mp_limb_t a, mp_limb_t b) 257 { 258 /* Maintain 259 260 a = u0 A mod B 261 b = - u1 A mod B 262 */ 263 mp_limb_t u0 = 1; 264 mp_limb_t u1 = 0; 265 mp_limb_t B = b; 266 267 ASSERT (a >= b); 268 ASSERT (b > 0); 269 270 for (;;) 100 271 { 101 272 mp_limb_t q; 102 int cnt; 103 for (cnt = 0; n0 >= d0; cnt++) 273 274 q = a / b; 275 a -= q * b; 276 277 if (a == 0) 104 278 { 105 d0 = d0 << 1; 279 *up = B - u1; 280 return b; 106 281 } 282 u0 += q * u1; 283 284 q = b / a; 285 b -= q * a; 107 286 108 q = 0; 109 while (cnt) 287 if (b == 0) 110 288 { 111 d0 = d0 >> 1; 112 q <<= 1; 113 if (n0 >= d0) 114 { 115 n0 = n0 - d0; 116 q |= 1; 117 } 118 cnt--; 289 *up = u0; 290 return a; 119 291 } 120 121 return q; 292 u1 += q * u0; 122 293 } 123 294 } 124 295 125 /* Two-limb division optimized for small quotients. */126 296 static mp_limb_t 127 div2 (mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)297 gcdext_1 (mp_limb_t *up, mp_limb_t *vp, mp_limb_t a, mp_limb_t b) 128 298 { 129 if ((mp_limb_signed_t) n1 < 0) 299 /* Maintain 300 301 a = u0 A - v0 B 302 b = - u1 A + v1 B = (B - u1) A - (A - v1) B 303 */ 304 mp_limb_t u0 = 1; 305 mp_limb_t v0 = 0; 306 mp_limb_t u1 = 0; 307 mp_limb_t v1 = 1; 308 309 mp_limb_t A = a; 310 mp_limb_t B = b; 311 312 ASSERT (a >= b); 313 ASSERT (b > 0); 314 315 for (;;) 130 316 { 131 317 mp_limb_t q; 132 int cnt; 133 for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++) 318 319 q = a / b; 320 a -= q * b; 321 322 if (a == 0) 134 323 { 135 d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1)); 136 d0 = d0 << 1; 324 *up = B - u1; 325 *vp = A - v1; 326 return b; 137 327 } 328 u0 += q * u1; 329 v0 += q * v1; 138 330 139 q = 0; 140 while (cnt) 331 q = b / a; 332 b -= q * a; 333 334 if (b == 0) 141 335 { 142 q <<= 1; 143 if (n1 > d1 || (n1 == d1 && n0 >= d0)) 144 { 145 sub_ddmmss (n1, n0, n1, n0, d1, d0); 146 q |= 1; 147 } 148 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); 149 d1 = d1 >> 1; 150 cnt--; 336 *up = u0; 337 *vp = v0; 338 return a; 151 339 } 340 u1 += q * u0; 341 v1 += q * v0; 342 } 343 } 344 #endif /* ! 
GCDEXT_1_USE_BINARY */ 345 346 /* FIXME: Duplicated in gcd.c */ 347 static mp_size_t 348 hgcd_tdiv (mp_ptr qp, 349 mp_ptr rp, mp_size_t *rsizep, 350 mp_srcptr ap, mp_size_t asize, 351 mp_srcptr bp, mp_size_t bsize) 352 { 353 mp_size_t qsize; 354 mp_size_t rsize; 355 356 mpn_tdiv_qr (qp, rp, 0, ap, asize, bp, bsize); 357 358 rsize = bsize; 359 MPN_NORMALIZE (rp, rsize); 360 *rsizep = rsize; 361 362 qsize = asize - bsize + 1; 363 qsize -= (qp[qsize - 1] == 0); 364 365 if (qsize == 1 && qp[0] == 1) 366 return 0; 367 368 return qsize; 369 } 152 370 153 return q; 371 /* FIXME: Duplicated in hgcd.c */ 372 static mp_limb_t 373 mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, 374 mp_ptr ap, mp_limb_t u, 375 mp_ptr bp, mp_limb_t v) 376 { 377 mp_limb_t h; 378 mp_limb_t cy; 379 380 h = mpn_mul_1 (rp, ap, n, u); 381 cy = mpn_addmul_1 (rp, bp, n, v); 382 h += cy; 383 #if GMP_NAIL_BITS == 0 384 rp[n] = h; 385 return (h < cy); 386 #else /* GMP_NAIL_BITS > 0 */ 387 rp[n] = h & GMP_NUMB_MASK; 388 return h >> GMP_NUMB_BITS; 389 #endif /* GMP_NAIL_BITS > 0 */ 390 } 391 392 393 /* Computes u2 = u0 + q u1 394 395 Returns new size. 396 397 FIXME: Notation in the function not quite consistent 398 FIXME: Severe code duplication with hgcd_update_uv */ 399 400 static mp_size_t 401 hgcd_update_u (struct hgcd_row *r, mp_size_t usize, 402 mp_srcptr qp, mp_size_t qsize, 403 /* Limbs allocated for the new u, for sanity 404 checking */ 405 mp_size_t alloc) 406 { 407 mp_srcptr u0p = r[0].uvp[0]; 408 mp_srcptr u1p = r[1].uvp[0]; 409 mp_ptr u2p = r[2].uvp[0]; 410 411 ASSERT (usize < alloc); 412 413 /* u1 = 0 is an exceptional case. Except for this, u1 should be 414 normalized. */ 415 416 ASSERT ((usize == 1 && u1p[0] == 0) || u1p[usize - 1] != 0); 417 418 /* Compute u2 = u0 + q u1 */ 419 420 if (usize == 1 && u1p[0] == 0) 421 { 422 /* u1 == 0 is a special case, then q might be large, but it 423 doesn't matter. Can happen only when u0 = v1 = 1, u1 = v0 = 424 0, and hence usize == 1. */ 425 MPN_COPY (u2p, u0p, usize); 426 } 427 else if (qsize == 0) 428 /* Represents a unit quotient */ 429 { 430 mp_limb_t cy = mpn_add_n (u2p, u0p, u1p, usize); 431 u2p[usize] = cy; 432 usize += (cy != 0); 433 } 434 else if (qsize == 1) 435 { 436 mp_limb_t cy; 437 438 cy = mpn_mul_1 (u2p, u1p, usize, qp[0]); 439 cy += mpn_add_n (u2p, u2p, u0p, usize); 440 441 u2p[usize] = cy; 442 usize += (cy != 0); 154 443 } 155 444 else 156 445 { 157 mp_limb_t q; 158 int cnt; 159 for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++) 446 if (qsize <= usize) 447 mpn_mul (u2p, u1p, usize, qp, qsize); 448 else 449 mpn_mul (u2p, qp, qsize, u1p, usize); 450 451 ASSERT_NOCARRY (mpn_add (u2p, 452 u2p, usize + qsize, 453 u0p, usize)); 454 455 usize += qsize; 456 usize -= (u2p[usize - 1] == 0); 457 } 458 ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); 459 ASSERT (r[2].uvp[0][usize - 1] != 0); 460 461 return usize; 462 } 463 464 465 /* Computes Y = R * X. No overlap allowed. 
*/ 466 static mp_size_t 467 hgcd2_mul_vector (struct hgcd_row *Y, 468 mp_size_t alloc, 469 const struct hgcd2_row *R, 470 const struct hgcd_row *X, mp_size_t n) 471 { 472 unsigned i; 473 int grow = 0; 474 mp_limb_t h = 0; 475 476 ASSERT (n < alloc); 477 478 for (i = 0; i < 2; i++) 479 { 480 /* Set Y[i] = R[i, 0] X[0] + R[i,1] X[1] 481 = u X[0] + v X[0] */ 482 mp_limb_t cy; 483 484 cy = mpn_addmul2_n_1 (Y[i].uvp[0], n, 485 X[0].uvp[0], R[i].u, 486 X[1].uvp[0], R[i].v); 487 488 if (cy) 160 489 { 161 d1 = (d1 << 1) | (d0 >> (GMP_LIMB_BITS - 1)); 162 d0 = d0 << 1; 490 ASSERT (n + 2 <= alloc); 491 Y[i].uvp[0][n+1] = cy; 492 grow = 1; 163 493 } 494 else 495 h |= Y[i].uvp[0][n]; 496 } 497 if (grow) 498 return n + 2; 499 else 500 /* Don't add redundant zeroes */ 501 return n + (h != 0); 502 } 503 504 /* Sets (a, b, c) <-- (b, c, a) */ 505 #define HGCD_SWAP3_LEFT(row) \ 506 do { \ 507 struct hgcd_row __hgcd_swap4_left_tmp = row[0]; \ 508 row[0] = row[1]; \ 509 row[1] = row[2]; \ 510 row[2] = __hgcd_swap4_left_tmp; \ 511 } while (0) 512 513 /* Sets (a, b, c, d) <-- (c, d, a, b) */ 514 #define HGCD_SWAP4_2(row) \ 515 do { \ 516 struct hgcd_row __hgcd_swap4_2_tmp = row[0]; \ 517 row[0] = row[2]; \ 518 row[2] = __hgcd_swap4_2_tmp; \ 519 __hgcd_swap4_2_tmp = row[1]; \ 520 row[1] = row[3]; \ 521 row[3] = __hgcd_swap4_2_tmp; \ 522 } while (0) 523 524 static mp_size_t 525 gcdext_lehmer_itch (mp_size_t asize, mp_size_t bsize) 526 { 527 mp_size_t ralloc = asize + 1; 528 mp_size_t ualloc = bsize + 1; 529 530 return 4 * ralloc + 4 * ualloc + asize; 531 } 164 532 165 q = 0; 166 while (cnt) 533 static mp_size_t 534 gcdext_lehmer (mp_ptr gp, mp_ptr up, mp_size_t *usize, 535 mp_srcptr ap, mp_size_t asize, 536 mp_srcptr bp, mp_size_t bsize, 537 mp_ptr tp, mp_size_t talloc) 538 { 539 struct hgcd_row r[4]; 540 /* Size and sign of u fields. The largest u should be normalized to 541 this size, and except for the case u1 = 0, that is the latest 542 u. */ 543 int rsize; 544 int rsign; 545 546 mp_ptr qp; 547 mp_size_t qsize; 548 mp_size_t ralloc = asize + 1; 549 mp_size_t ualloc = bsize + 1; 550 551 struct hgcd2 hgcd; 552 int res; 553 554 ASSERT (asize >= bsize); 555 ASSERT (asize > 1); 556 ASSERT (bsize > 0); 557 558 ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); 559 560 ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); 561 562 r[0].rp = tp; tp += ralloc; talloc -= ralloc; 563 r[1].rp = tp; tp += ralloc; talloc -= ralloc; 564 r[2].rp = tp; tp += ralloc; talloc -= ralloc; 565 r[3].rp = tp; tp += ralloc; talloc -= ralloc; 566 567 /* Must zero out the u fields. We don't use the v fields. */ 568 MPN_ZERO (tp, 4 * ualloc); 569 570 r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 571 r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 572 r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 573 r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 574 575 qp = tp; tp += asize; talloc -= asize; 576 577 res = mpn_hgcd2_lehmer_step (&hgcd, 578 ap, asize, 579 bp, bsize, 580 NULL); 581 582 if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) 583 { 584 qsize = hgcd_tdiv (qp, r[1].rp, &r[1].rsize, 585 ap, asize, 586 bp, bsize); 587 MPN_COPY (r[0].rp, bp, bsize); 588 r[0].rsize = bsize; 589 590 r[0].uvp[0][0] = 0; 591 r[1].uvp[0][0] = 1; 592 rsign = -1; 593 } 594 else 595 { 596 const struct hgcd2_row *s = hgcd.row + (res - 2); 597 rsign = hgcd.sign; 598 if (res == 3) 599 rsign = ~rsign; 600 601 /* s[0] and s[1] correct. 
*/ 602 r[0].rsize 603 = mpn_hgcd2_fix (r[0].rp, ralloc, 604 rsign, 605 s[0].u, ap, asize, 606 s[0].v, bp, bsize); 607 608 r[1].rsize 609 = mpn_hgcd2_fix (r[1].rp, ralloc, 610 ~rsign, 611 s[1].u, ap, asize, 612 s[1].v, bp, bsize); 613 614 r[0].uvp[0][0] = s[0].u; 615 r[1].uvp[0][0] = s[1].u; 616 } 617 rsize = 1; 618 619 while (r[0].rsize >= 2 && r[1].rsize > 0) 620 { 621 res = mpn_hgcd2_lehmer_step (&hgcd, 622 r[0].rp, r[0].rsize, 623 r[1].rp, r[1].rsize, 624 NULL); 625 626 if (res == 0 || (res == 2 && hgcd.row[0].v == 0)) 167 627 { 168 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); 169 d1 = d1 >> 1; 170 q <<= 1; 171 if (n1 > d1 || (n1 == d1 && n0 >= d0)) 172 { 173 sub_ddmmss (n1, n0, n1, n0, d1, d0); 174 q |= 1; 175 } 176 cnt--; 628 qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, 629 r[0].rp, r[0].rsize, 630 r[1].rp, r[1].rsize); 631 rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); 632 HGCD_SWAP3_LEFT (r); 633 rsign = ~rsign; 177 634 } 635 else 636 { 637 const struct hgcd2_row *s = hgcd.row + (res - 2); 638 int sign = hgcd.sign; 639 if (res == 3) 640 sign = ~sign; 641 642 /* s[0] and s[1] correct. */ 643 r[2].rsize 644 = mpn_hgcd2_fix (r[2].rp, ralloc, 645 sign, 646 s[0].u, r[0].rp, r[0].rsize, 647 s[0].v, r[1].rp, r[1].rsize); 648 649 r[3].rsize 650 = mpn_hgcd2_fix (r[3].rp, ralloc, 651 ~sign, 652 s[1].u, r[0].rp, r[0].rsize, 653 s[1].v, r[1].rp, r[1].rsize); 654 655 rsize = hgcd2_mul_vector (r + 2, ralloc, s, r, rsize); 656 rsign ^= sign; 657 HGCD_SWAP4_2 (r); 658 } 659 } 660 661 if (r[1].rsize == 0) 662 { 663 MPN_NORMALIZE (r[0].uvp[0], rsize); 664 MPN_COPY (gp, r[0].rp, r[0].rsize); 665 MPN_COPY (up, r[0].uvp[0], rsize); 178 666 179 return q; 667 *usize = (rsign >= 0) ? rsize : -rsize; 668 return r[0].rsize; 669 } 670 else 671 { 672 mp_limb_t cy; 673 mp_limb_t u; 674 mp_limb_t v; 675 676 gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); 677 cy = mpn_addmul2_n_1 (up, rsize, 678 r[0].uvp[0], u, 679 r[1].uvp[0], v); 680 rsize++; 681 if (cy) 682 up[rsize++] = cy; 683 else 684 MPN_NORMALIZE (up, rsize); 685 686 *usize = (rsign >= 0) ? rsize : -rsize; 687 return 1; 180 688 } 181 689 } 182 690 183 mp_size_t 184 #if EXTEND 185 mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size, 186 mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) 187 #else 188 mpn_gcd (mp_ptr gp, 189 mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize) 190 #endif 191 { 192 mp_limb_t A, B, C, D; 193 int cnt; 194 mp_ptr tp, wp; 195 #if RECORD 196 mp_limb_t max = 0; 197 #endif 198 #if EXTEND 199 mp_ptr s1p; 200 mp_ptr orig_s0p = s0p; 201 mp_size_t ssize; 202 int sign = 1; 203 #endif 204 int use_double_flag; 205 TMP_DECL; 691 /* Computes Y = R * X. No overlap allowed. 206 692 207 ASSERT (size >= vsize); 208 ASSERT (vsize >= 1); 209 ASSERT (up[size-1] != 0); 210 ASSERT (vp[vsize-1] != 0); 211 ASSERT (! MPN_OVERLAP_P (up, size+1, vp, vsize+1)); 212 #if EXTEND 213 ASSERT (! MPN_OVERLAP_P (s0p, size, up, size+1)); 214 ASSERT (! MPN_OVERLAP_P (s0p, size, vp, vsize+1)); 215 #endif 216 ASSERT (MPN_SAME_OR_SEPARATE_P (gp, up, size)); 217 ASSERT (MPN_SAME_OR_SEPARATE2_P (gp, size, vp, vsize)); 693 Temporary space is needed for two numbers smaller than the 694 resulting matrix elements, i.e. bounded by 2*L <= N. 218 695 219 TMP_MARK;696 FIXME: Severe code duplication with hgcd.c: hgcd_mul. */ 220 697 221 tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); 222 wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); 223 #if EXTEND 224 s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB); 225 226 #if ! 
WANT_GCDEXT_ONE_STEP 227 MPN_ZERO (s0p, size); 228 MPN_ZERO (s1p, size); 229 #endif 698 static mp_size_t 699 hgcd_mul_vector (struct hgcd_row *Y, mp_size_t alloc, 700 const struct hgcd_row *R, mp_size_t rsize, 701 const struct hgcd_row *X, mp_size_t xsize, 702 mp_ptr tp, mp_size_t talloc) 703 { 704 unsigned i; 230 705 231 s0p[0] = 1; 232 s1p[0] = 0; 233 ssize = 1; 234 #endif 706 mp_size_t ysize; 707 mp_limb_t h; 708 int grow; 709 710 MPN_NORMALIZE (R[1].uvp[1], rsize); 711 /* u1 = 0 is an exceptional case. Except for this, u1 should be 712 normalized. */ 713 ASSERT ((xsize == 1 && X[1].uvp[0][0] == 0) 714 || X[1].uvp[0][xsize - 1] != 0); 235 715 236 if ( size > vsize)716 if (xsize == 1 && X[1].uvp[0][0] == 0) 237 717 { 238 mpn_tdiv_qr (tp, up, (mp_size_t) 0, up, size, vp, vsize); 718 /* Special case. Set Y[i, 0] = R[i, 0] */ 719 ASSERT (X[0].uvp[0][0] == 1); 239 720 240 #if EXTEND 241 /* This is really what it boils down to in this case... */ 242 s0p[0] = 0; 243 s1p[0] = 1; 244 sign = -sign; 245 #endif 246 size = vsize; 247 MP_PTR_SWAP (up, vp); 721 if (rsize > 1) 722 MPN_NORMALIZE (R[1].uvp[0], rsize); 723 MPN_COPY (Y[0].uvp[0], R[0].uvp[0], rsize); 724 MPN_COPY (Y[1].uvp[0], R[1].uvp[0], rsize); 725 726 return rsize; 248 727 } 249 728 250 use_double_flag = ABOVE_THRESHOLD (size, GCDEXT_THRESHOLD); 729 ysize = rsize + xsize; 730 ASSERT (ysize <= talloc); 251 731 252 for (;;) 732 h = 0; grow = 0; 733 734 if (rsize >= xsize) 253 735 { 254 mp_limb_t asign; 255 /* Figure out exact size of V. */ 256 vsize = size; 257 MPN_NORMALIZE (vp, vsize); 258 if (vsize <= 1) 259 break; 260 261 if (use_double_flag) 262 { 263 mp_limb_t uh, vh, ul, vl; 264 /* Let UH,UL be the most significant limbs of U, and let VH,VL be 265 the corresponding bits from V. */ 266 uh = up[size - 1]; 267 vh = vp[size - 1]; 268 ul = up[size - 2]; 269 vl = vp[size - 2]; 270 count_leading_zeros (cnt, uh); 271 #if GMP_NAIL_BITS == 0 272 if (cnt != 0) 273 { 274 uh = (uh << cnt) | (ul >> (GMP_LIMB_BITS - cnt)); 275 vh = (vh << cnt) | (vl >> (GMP_LIMB_BITS - cnt)); 276 vl <<= cnt; 277 ul <<= cnt; 278 if (size >= 3) 279 { 280 ul |= (up[size - 3] >> (GMP_LIMB_BITS - cnt)); 281 vl |= (vp[size - 3] >> (GMP_LIMB_BITS - cnt)); 282 } 283 } 284 #else 285 uh = uh << cnt; 286 vh = vh << cnt; 287 if (cnt < GMP_NUMB_BITS) 288 { /* GMP_NAIL_BITS <= cnt < GMP_NUMB_BITS */ 289 uh |= ul >> (GMP_NUMB_BITS - cnt); 290 vh |= vl >> (GMP_NUMB_BITS - cnt); 291 ul <<= cnt + GMP_NAIL_BITS; 292 vl <<= cnt + GMP_NAIL_BITS; 293 if (size >= 3) 294 { 295 if (cnt + GMP_NAIL_BITS > GMP_NUMB_BITS) 296 { 297 ul |= up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; 298 vl |= vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; 299 if (size >= 4) 300 { 301 ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; 302 vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; 303 } 304 } 305 else 306 { 307 ul |= up[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS); 308 vl |= vp[size - 3] >> (GMP_LIMB_BITS - cnt - 2 * GMP_NAIL_BITS); 309 } 310 } 311 } 312 else 313 { /* GMP_NUMB_BITS <= cnt <= GMP_LIMB_BITS-1 */ 314 uh |= ul << cnt - GMP_NUMB_BITS; /* 0 <= c <= GMP_NAIL_BITS-1 */ 315 vh |= vl << cnt - GMP_NUMB_BITS; 316 if (size >= 3) 317 { 318 if (cnt - GMP_NUMB_BITS != 0) 319 { /* uh/vh need yet more bits! 
*/ 320 uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt; 321 vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt; 322 ul = up[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; 323 vl = vp[size - 3] << cnt + GMP_NAIL_BITS - GMP_NUMB_BITS; 324 if (size >= 4) 325 { 326 ul |= up[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; 327 vl |= vp[size - 4] >> 2 * GMP_NUMB_BITS - GMP_NAIL_BITS - cnt; 328 } 329 } 330 else 331 { 332 ul = up[size - 3] << GMP_LIMB_BITS - cnt; 333 vl = vp[size - 3] << GMP_LIMB_BITS - cnt; 334 if (size >= 4) 335 { 336 ul |= up[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt); 337 vl |= vp[size - 4] >> GMP_NUMB_BITS - (GMP_LIMB_BITS - cnt); 338 } 339 } 340 } 341 else 342 { 343 ul = 0; 344 vl = 0; 345 } 346 } 347 #endif 736 for (i = 0; i < 2; i++) 737 { 738 /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ 739 mp_limb_t cy; 348 740 349 A = 1; 350 B = 0; 351 C = 0; 352 D = 1; 741 mpn_mul (Y[i].uvp[0], R[i].uvp[0], rsize, X[0].uvp[0], xsize); 742 mpn_mul (tp, R[i].uvp[1], rsize, X[1].uvp[0], xsize); 353 743 354 asign = 0; 355 for (;;) 356 { 357 mp_limb_t Tac, Tbd; 358 mp_limb_t q1, q2; 359 mp_limb_t nh, nl, dh, dl; 360 mp_limb_t t1, t0; 361 mp_limb_t Th, Tl; 362 363 sub_ddmmss (dh, dl, vh, vl, 0, C); 364 if (dh == 0) 365 break; 366 add_ssaaaa (nh, nl, uh, ul, 0, A); 367 q1 = div2 (nh, nl, dh, dl); 368 369 add_ssaaaa (dh, dl, vh, vl, 0, D); 370 if (dh == 0) 371 break; 372 sub_ddmmss (nh, nl, uh, ul, 0, B); 373 q2 = div2 (nh, nl, dh, dl); 374 375 if (q1 != q2) 376 break; 377 378 Tac = A + q1 * C; 379 if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX) 380 break; 381 Tbd = B + q1 * D; 382 if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX) 383 break; 384 A = C; 385 C = Tac; 386 B = D; 387 D = Tbd; 388 umul_ppmm (t1, t0, q1, vl); 389 t1 += q1 * vh; 390 sub_ddmmss (Th, Tl, uh, ul, t1, t0); 391 uh = vh, ul = vl; 392 vh = Th, vl = Tl; 393 394 asign = ~asign; 395 396 add_ssaaaa (dh, dl, vh, vl, 0, C); 397 /* if (dh == 0) should never happen 398 break; */ 399 sub_ddmmss (nh, nl, uh, ul, 0, A); 400 q1 = div2 (nh, nl, dh, dl); 401 402 sub_ddmmss (dh, dl, vh, vl, 0, D); 403 if (dh == 0) 404 break; 405 add_ssaaaa (nh, nl, uh, ul, 0, B); 406 q2 = div2 (nh, nl, dh, dl); 407 408 if (q1 != q2) 409 break; 410 411 Tac = A + q1 * C; 412 if (GMP_NAIL_BITS != 0 && Tac > GMP_NUMB_MAX) 413 break; 414 Tbd = B + q1 * D; 415 if (GMP_NAIL_BITS != 0 && Tbd > GMP_NUMB_MAX) 416 break; 417 A = C; 418 C = Tac; 419 B = D; 420 D = Tbd; 421 umul_ppmm (t1, t0, q1, vl); 422 t1 += q1 * vh; 423 sub_ddmmss (Th, Tl, uh, ul, t1, t0); 424 uh = vh, ul = vl; 425 vh = Th, vl = Tl; 744 cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); 426 745 427 asign = ~asign; 746 if (cy) 747 { 748 ASSERT (ysize + 1 < alloc); 749 Y[i].uvp[0][ysize] = cy; 750 grow = 1; 428 751 } 429 #if EXTEND 430 if (asign) 431 sign = -sign; 432 #endif 752 else 753 h |= Y[i].uvp[0][ysize - 1]; 433 754 } 434 else /* Same, but using single-limb calculations. */ 755 } 756 else 757 { 758 for (i = 0; i < 2; i++) 435 759 { 436 mp_limb_t uh, vh; 437 /* Make UH be the most significant limb of U, and make VH be 438 corresponding bits from V. 
*/ 439 uh = up[size - 1]; 440 vh = vp[size - 1]; 441 count_leading_zeros (cnt, uh); 442 #if GMP_NAIL_BITS == 0 443 if (cnt != 0) 444 { 445 uh = (uh << cnt) | (up[size - 2] >> (GMP_LIMB_BITS - cnt)); 446 vh = (vh << cnt) | (vp[size - 2] >> (GMP_LIMB_BITS - cnt)); 447 } 448 #else 449 uh <<= cnt; 450 vh <<= cnt; 451 if (cnt < GMP_NUMB_BITS) 760 /* Set Y[i, 0] = R[i, 0] X[0, 0] + R[i,1] X[1, 0] */ 761 mp_limb_t cy; 762 763 mpn_mul (Y[i].uvp[0], X[0].uvp[0], xsize, R[i].uvp[0], rsize); 764 mpn_mul (tp, X[1].uvp[0], xsize, R[i].uvp[1], rsize); 765 766 cy = mpn_add_n (Y[i].uvp[0], Y[i].uvp[0], tp, ysize); 767 768 if (cy) 452 769 { 453 uh |= up[size - 2] >> (GMP_NUMB_BITS - cnt); 454 vh |= vp[size - 2] >> (GMP_NUMB_BITS - cnt); 770 ASSERT (ysize + 1 < alloc); 771 Y[i].uvp[0][ysize] = cy; 772 grow = 1; 455 773 } 456 774 else 457 { 458 uh |= up[size - 2] << cnt - GMP_NUMB_BITS; 459 vh |= vp[size - 2] << cnt - GMP_NUMB_BITS; 460 if (size >= 3) 461 { 462 uh |= up[size - 3] >> 2 * GMP_NUMB_BITS - cnt; 463 vh |= vp[size - 3] >> 2 * GMP_NUMB_BITS - cnt; 464 } 465 } 466 #endif 775 h |= Y[i].uvp[0][ysize - 1]; 776 } 777 } 467 778 468 A = 1; 469 B = 0;470 C = 0; 471 D = 1;779 if (grow) 780 ysize++; 781 else 782 ysize -= (h == 0); 472 783 473 asign = 0; 474 for (;;) 475 { 476 mp_limb_t q, T; 477 if (vh - C == 0 || vh + D == 0) 478 break; 479 480 q = (uh + A) / (vh - C); 481 if (q != (uh - B) / (vh + D)) 482 break; 483 484 T = A + q * C; 485 A = C; 486 C = T; 487 T = B + q * D; 488 B = D; 489 D = T; 490 T = uh - q * vh; 491 uh = vh; 492 vh = T; 493 494 asign = ~asign; 495 496 if (vh - D == 0) 497 break; 498 499 q = (uh - A) / (vh + C); 500 if (q != (uh + B) / (vh - D)) 501 break; 502 503 T = A + q * C; 504 A = C; 505 C = T; 506 T = B + q * D; 507 B = D; 508 D = T; 509 T = uh - q * vh; 510 uh = vh; 511 vh = T; 784 ASSERT ((ysize == 1 && Y[1].uvp[0][0] == 0) || Y[1].uvp[0][ysize - 1] != 0); 512 785 513 asign = ~asign; 514 } 515 #if EXTEND 516 if (asign) 517 sign = -sign; 786 return ysize; 787 } 788 789 #define COMPUTE_V_ITCH(asize, bsize, usize) \ 790 ((usize) + (asize) + 1 + (bsize)) 791 792 /* Computes |v| = |(c - u a)| / b, where u may be positive or negative, 793 and v is of the opposite sign. Requires that b, c, |u| <= a. */ 794 static mp_size_t 795 compute_v (mp_ptr vp, mp_size_t valloc, 796 mp_srcptr ap, mp_size_t asize, 797 mp_srcptr bp, mp_size_t bsize, 798 mp_srcptr cp, mp_size_t csize, 799 mp_srcptr up, mp_size_t usize, 800 mp_ptr tp, mp_size_t talloc) 801 { 802 mp_size_t size; 803 mp_size_t vsize; 804 mp_ptr rp; 805 806 ASSERT (asize); 807 ASSERT (bsize); 808 ASSERT (csize); 809 ASSERT (asize >= bsize); 810 811 #if 0 812 trace ("compute_v: a = %Nd\n" 813 " b = %Nd\n" 814 " c = %Nd\n" 815 " u = %Nd\n", 816 ap, asize, bp, bsize, cp, csize, up, usize); 518 817 #endif 818 819 ASSERT (usize); 820 821 size = ABS (usize); 822 823 ASSERT (size <= asize); 824 ASSERT (asize + size <= talloc); 825 826 mpn_mul (tp, ap, asize, up, size); 827 size += asize; 828 829 ASSERT (csize <= size); 830 831 if (usize > 0) 832 { 833 /* |v| = -v = (u a - c) / b */ 834 835 ASSERT_NOCARRY (mpn_sub (tp, tp, size, cp, csize)); 836 MPN_NORMALIZE (tp, size); 837 if (size == 0) 838 return 0; 839 } 840 else 841 { /* usize < 0 */ 842 /* |v| = v = (c - u a) / b = (c + |u| a) / b */ 843 mp_limb_t cy = mpn_add (tp, tp, size, cp, csize); 844 if (cy) 845 { 846 ASSERT (size < talloc); 847 tp[size++] = cy; 519 848 } 849 } 850 851 /* Now divide t / b. 
There must be no remainder */ 520 852 521 #if RECORD 522 max = MAX (A, max); max = MAX (B, max); 523 max = MAX (C, max); max = MAX (D, max); 853 ASSERT (size >= bsize); 854 ASSERT (size + bsize <= talloc); 855 rp = tp + size; 856 857 vsize = size + 1 - bsize; 858 ASSERT (vsize <= valloc); 859 860 mpn_tdiv_qr (vp, rp, 0, tp, size, bp, bsize); 861 MPN_NORMALIZE (vp, vsize); 862 863 /* Remainder must be zero */ 864 #if WANT_ASSERT 865 { 866 mp_size_t i; 867 for (i = 0; i < bsize; i++) 868 { 869 ASSERT (rp[i] == 0); 870 } 871 } 524 872 #endif 873 return vsize; 874 } 875 876 static mp_size_t 877 gcdext_schoenhage_itch (mp_size_t asize, mp_size_t bsize) 878 { 879 mp_size_t itch; 525 880 526 if (B == 0) 881 mp_size_t ralloc = asize + 1; 882 mp_size_t ualloc = bsize + 1; 883 /* Input size for hgcd calls */ 884 mp_size_t halloc = (asize + 1) / 2; 885 886 /* Storage for the rows and quotient */ 887 mp_size_t rstorage = 4 * ralloc + 4 * ualloc + asize; 888 889 /* Storage for hgcd calls */ 890 mp_size_t tstorage = mpn_hgcd_init_itch (halloc) 891 + qstack_itch (halloc) 892 + mpn_hgcd_itch (halloc); 893 894 /* Storage needed for final gcdext_lehmer */ 895 mp_size_t lstorage 896 = gcdext_lehmer_itch (GCDEXT_SCHOENHAGE_THRESHOLD, 897 GCDEXT_SCHOENHAGE_THRESHOLD); 898 899 /* Storage needed after final nhgcd_gcdext_lehmer */ 900 mp_size_t fstorage 901 = COMPUTE_V_ITCH (GCDEXT_SCHOENHAGE_THRESHOLD, 902 GCDEXT_SCHOENHAGE_THRESHOLD, 903 ualloc); 904 905 /* We need rstorage + MAX (tstorage, lstorage, fstorage) */ 906 907 itch = tstorage; 908 if (lstorage > tstorage) 909 itch = lstorage; 910 if (fstorage > itch) 911 itch = fstorage; 912 913 return rstorage + itch; 914 } 915 916 #if WANT_ASSERT 917 static void 918 sanity_check_row (mp_srcptr ap, mp_size_t asize, 919 mp_srcptr bp, mp_size_t bsize, 920 int sign, mp_size_t usize, 921 const struct hgcd_row *r) 922 { 923 /* Check that x = u * a + v * b, for some v, i.e. that 924 x - u*a is divisible by b. 
*/ 925 mp_srcptr up = r->uvp[0]; 926 mp_srcptr xp = r->rp; 927 mp_size_t xsize = r->rsize; 928 mp_ptr tp; 929 mp_size_t tsize; 930 mp_ptr qp; 931 mp_size_t qsize; 932 mp_ptr rp; 933 mp_size_t i; 934 TMP_DECL; 935 TMP_MARK; 936 937 ASSERT (asize > 0 && ap[asize - 1] != 0); 938 ASSERT (bsize > 0 && bp[bsize - 1] != 0); 939 ASSERT (xsize == 0 || xp[xsize - 1] != 0); 940 ASSERT (MPN_LEQ_P (xp, xsize, ap, asize)); 941 ASSERT (MPN_LEQ_P (up, usize, bp, bsize)); 942 943 MPN_NORMALIZE (up, usize); 944 if (usize == 0) 945 { 946 ASSERT (MPN_EQUAL_P (xp, xsize, bp, bsize)); 947 return; 948 } 949 950 tp = TMP_ALLOC_LIMBS (usize + asize + 1); 951 qp = TMP_ALLOC_LIMBS (usize + asize + 2 - bsize); 952 rp = TMP_ALLOC_LIMBS (bsize); 953 954 mpn_mul (tp, ap, asize, up, usize); 955 tsize = asize + usize; 956 tsize -= (tp[tsize - 1] == 0); 957 958 if (sign >= 0) 959 { 960 ASSERT_NOCARRY (mpn_sub (tp, tp, tsize, xp, xsize)); 961 MPN_NORMALIZE (tp, tsize); 962 } 963 else 964 { 965 mp_limb_t cy = mpn_add (tp, tp, tsize, xp, xsize); 966 tp[tsize] = cy; 967 tsize += (cy != 0); 968 } 969 970 if (tsize > 0) 971 { 972 mpn_tdiv_qr (qp, rp, 0, tp, tsize, bp, bsize); 973 for (i = 0; i < bsize; i++) 974 ASSERT (rp[i] == 0); 975 qsize = tsize - bsize; 976 qsize += (qp[qsize] != 0); 977 ASSERT (MPN_LEQ_P (qp, qsize, ap, asize)); 978 } 979 TMP_FREE; 980 } 981 # define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) \ 982 sanity_check_row (ap, asize, bp, bsize, sign, usize, r) 983 984 #else /* !WANT_ASSERT */ 985 # define ASSERT_ROW(ap, asize, bp, bsize, sign, usize, r) 986 #endif /* !WANT_ASSERT */ 987 988 static mp_size_t 989 gcdext_schoenhage (mp_ptr gp, mp_ptr up, mp_size_t *usizep, 990 mp_srcptr ap, mp_size_t asize, 991 mp_srcptr bp, mp_size_t bsize, 992 mp_ptr tp, mp_size_t talloc) 993 { 994 mp_size_t scratch; 995 struct hgcd hgcd; 996 struct qstack quotients; 997 struct hgcd_row r[4]; 998 999 /* Size and sign of u fields. The largest u should be normalized to 1000 this size, and except for the case u1 = 0, that is the latest 1001 u. */ 1002 int rsize; 1003 int rsign; 1004 1005 mp_ptr qp; 1006 mp_size_t qsize; 1007 mp_size_t ralloc = asize + 1; 1008 mp_size_t ualloc = bsize + 1; 1009 1010 ASSERT (asize >= bsize); 1011 ASSERT (bsize > 0); 1012 1013 ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); 1014 1015 ASSERT (4 * ralloc + 4*ualloc + asize <= talloc); 1016 1017 r[0].rp = tp; tp += ralloc; talloc -= ralloc; 1018 r[1].rp = tp; tp += ralloc; talloc -= ralloc; 1019 r[2].rp = tp; tp += ralloc; talloc -= ralloc; 1020 r[3].rp = tp; tp += ralloc; talloc -= ralloc; 1021 1022 /* Must zero out the u fields */ 1023 MPN_ZERO (tp, 4 * ualloc); 1024 1025 r[0].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 1026 r[1].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 1027 r[2].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 1028 r[3].uvp[0] = tp; tp += ualloc; talloc -= ualloc; 1029 1030 qp = tp; tp += asize; talloc -= asize; 1031 1032 ASSERT (asize >= bsize); 1033 ASSERT (bsize > 0); 1034 MPN_COPY (r[0].rp, ap, asize); r[0].rsize = asize; 1035 MPN_COPY (r[1].rp, bp, bsize); r[1].rsize = bsize; 1036 1037 r[0].uvp[0][0] = 1; 1038 r[1].uvp[0][0] = 0; 1039 1040 /* We don't use the v fields. 
*/ 1041 rsize = 1; 1042 rsign = 0; 1043 1044 scratch = mpn_hgcd_init_itch ((asize + 1) / 2); 1045 ASSERT (scratch <= talloc); 1046 mpn_hgcd_init (&hgcd, (asize + 1) / 2, tp); 1047 tp += scratch; talloc -= scratch; 1048 1049 { 1050 mp_size_t nlimbs = qstack_itch ((asize + 1) / 2); 1051 1052 ASSERT (nlimbs <= talloc); 1053 qstack_init ("ients, (asize + 1) / 2, tp, nlimbs); 1054 1055 tp += nlimbs; 1056 talloc -= nlimbs; 1057 scratch += nlimbs; 1058 } 1059 1060 while (ABOVE_THRESHOLD (r[0].rsize, GCDEXT_SCHOENHAGE_THRESHOLD) 1061 && r[1].rsize > 0) 1062 { 1063 mp_size_t k = r[0].rsize / 2; 1064 int res; 1065 1066 ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r); 1067 ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 1); 1068 1069 if (r[1].rsize <= k) 1070 goto euclid; 1071 1072 qstack_reset ("ients, r[0].rsize - k); 1073 1074 res = mpn_hgcd (&hgcd, 1075 r[0].rp + k, r[0].rsize - k, 1076 r[1].rp + k, r[1].rsize - k, 1077 "ients, 1078 tp, talloc); 1079 1080 if (res == 0 || res == 1) 527 1081 { 528 /* This is quite rare. I.e., optimize something else! */ 1082 euclid: 1083 qsize = hgcd_tdiv (qp, r[2].rp, &r[2].rsize, 1084 r[0].rp, r[0].rsize, 1085 r[1].rp, r[1].rsize); 1086 rsize = hgcd_update_u (r, rsize, qp, qsize, ualloc); 1087 ASSERT (rsize < ualloc); 529 1088 530 mpn_tdiv_qr (wp, up, (mp_size_t) 0, up, size, vp, vsize);1089 ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); 531 1090 532 #if EXTEND 533 MPN_COPY (tp, s0p, ssize); 534 { 535 mp_size_t qsize; 536 mp_size_t i; 537 538 qsize = size - vsize + 1; /* size of stored quotient from division */ 539 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ 540 541 for (i = 0; i < qsize; i++) 542 { 543 mp_limb_t cy; 544 cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); 545 tp[ssize + i] = cy; 546 } 547 548 ssize += qsize; 549 ssize -= tp[ssize - 1] == 0; 550 } 551 552 sign = -sign; 553 MP_PTR_SWAP (s0p, s1p); 554 MP_PTR_SWAP (s1p, tp); 555 #endif 556 size = vsize; 557 MP_PTR_SWAP (up, vp); 1091 HGCD_SWAP3_LEFT (r); 1092 rsign = ~rsign; 558 1093 } 559 1094 else 560 1095 { 561 #if EXTEND 562 mp_size_t tsize, wsize; 563 #endif 564 /* T = U*A + V*B 565 W = U*C + V*D 566 U = T 567 V = W */ 568 569 #if STAT 570 { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x); 571 arr[GMP_LIMB_BITS - cnt]++; } 572 #endif 573 if (A == 0) 574 { 575 /* B == 1 and C == 1 (D is arbitrary) */ 576 mp_limb_t cy; 577 MPN_COPY (tp, vp, size); 578 MPN_COPY (wp, up, size); 579 mpn_submul_1 (wp, vp, size, D); 580 MP_PTR_SWAP (tp, up); 581 MP_PTR_SWAP (wp, vp); 582 #if EXTEND 583 MPN_COPY (tp, s1p, ssize); 584 tsize = ssize; 585 tp[ssize] = 0; /* must zero since wp might spill below */ 586 MPN_COPY (wp, s0p, ssize); 587 cy = mpn_addmul_1 (wp, s1p, ssize, D); 588 wp[ssize] = cy; 589 wsize = ssize + (cy != 0); 590 MP_PTR_SWAP (tp, s0p); 591 MP_PTR_SWAP (wp, s1p); 592 ssize = MAX (wsize, tsize); 593 #endif 594 } 595 else 596 { 597 mp_limb_t cy, cy1, cy2; 1096 const struct hgcd_row *s = hgcd.row + (res - 2); 1097 int sign = hgcd.sign; 1098 if (res == 3) 1099 sign = ~sign; 1100 1101 /* s[0] and s[1] are correct */ 1102 r[2].rsize 1103 = mpn_hgcd_fix (k, r[2].rp, ralloc, 1104 sign, hgcd.size, s, 1105 r[0].rp, r[1].rp, 1106 tp, talloc); 1107 1108 r[3].rsize 1109 = mpn_hgcd_fix (k, r[3].rp, ralloc, 1110 ~sign, hgcd.size, s+1, 1111 r[0].rp, r[1].rp, 1112 tp, talloc); 1113 1114 rsize = hgcd_mul_vector (r + 2, ualloc, s, hgcd.size, 1115 r, rsize, tp, talloc); 1116 ASSERT (rsize < ualloc); 1117 1118 rsign ^= sign; 1119 ASSERT_ROW (ap, asize, bp, bsize, rsign, rsize, r + 2); 
1120 ASSERT_ROW (ap, asize, bp, bsize, ~rsign, rsize, r + 3); 598 1121 599 if (asign) 600 { 601 mpn_mul_1 (tp, vp, size, B); 602 mpn_submul_1 (tp, up, size, A); 603 mpn_mul_1 (wp, up, size, C); 604 mpn_submul_1 (wp, vp, size, D); 605 } 606 else 607 { 608 mpn_mul_1 (tp, up, size, A); 609 mpn_submul_1 (tp, vp, size, B); 610 mpn_mul_1 (wp, vp, size, D); 611 mpn_submul_1 (wp, up, size, C); 612 } 613 MP_PTR_SWAP (tp, up); 614 MP_PTR_SWAP (wp, vp); 615 #if EXTEND 616 /* Compute new s0 */ 617 cy1 = mpn_mul_1 (tp, s0p, ssize, A); 618 cy2 = mpn_addmul_1 (tp, s1p, ssize, B); 619 cy = cy1 + cy2; 620 tp[ssize] = cy & GMP_NUMB_MASK; 621 tsize = ssize + (cy != 0); 622 #if GMP_NAIL_BITS == 0 623 if (cy < cy1) 624 #else 625 if (cy > GMP_NUMB_MAX) 626 #endif 627 { 628 tp[tsize] = 1; 629 wp[tsize] = 0; 630 tsize++; 631 /* This happens just for nails, since we get more work done 632 per numb there. */ 633 } 634 635 /* Compute new s1 */ 636 cy1 = mpn_mul_1 (wp, s1p, ssize, D); 637 cy2 = mpn_addmul_1 (wp, s0p, ssize, C); 638 cy = cy1 + cy2; 639 wp[ssize] = cy & GMP_NUMB_MASK; 640 wsize = ssize + (cy != 0); 641 #if GMP_NAIL_BITS == 0 642 if (cy < cy1) 643 #else 644 if (cy > GMP_NUMB_MAX) 645 #endif 646 { 647 wp[wsize] = 1; 648 if (wsize >= tsize) 649 tp[wsize] = 0; 650 wsize++; 651 } 652 653 MP_PTR_SWAP (tp, s0p); 654 MP_PTR_SWAP (wp, s1p); 655 ssize = MAX (wsize, tsize); 656 #endif 657 } 658 size -= up[size - 1] == 0; 659 #if GMP_NAIL_BITS != 0 660 size -= up[size - 1] == 0; 661 #endif 1122 HGCD_SWAP4_2 (r); 662 1123 } 663 664 #if WANT_GCDEXT_ONE_STEP665 TMP_FREE;666 return 0;667 #endif668 1124 } 1125 if (r[1].rsize == 0) 1126 { 1127 MPN_COPY (gp, r[0].rp, r[0].rsize); 1128 MPN_NORMALIZE (r[0].uvp[0], rsize); 1129 MPN_COPY (up, r[0].uvp[0], rsize); 669 1130 670 #if RECORD 671 printf ("max: %lx\n", max); 672 #endif 1131 *usizep = (rsign >= 0) ? rsize : - rsize; 1132 return r[0].rsize; 1133 } 1134 else if (r[0].rsize == 1) 1135 { 1136 mp_limb_t u; 1137 mp_limb_t v; 1138 mp_limb_t cy; 1139 1140 gp[0] = gcdext_1 (&u, &v, r[0].rp[0], r[1].rp[0]); 1141 1142 /* g = u r0 + v r1 = (u u0 + v u1) a + (...) b */ 1143 cy = mpn_addmul2_n_1 (up, rsize, 1144 r[0].uvp[0], u, 1145 r[1].uvp[0], v); 1146 1147 rsize++; 1148 if (cy) 1149 up[rsize++] = cy; 1150 else 1151 MPN_NORMALIZE (up, rsize); 673 1152 674 #if STAT 675 {int i; for (i = 0; i <= GMP_LIMB_BITS; i++) printf ("%d:%d\n", i, arr[i]);} 676 #endif 1153 *usizep = (rsign >= 0) ? rsize : -rsize; 1154 return 1; 677 1155 678 if (vsize == 0)679 {680 if (gp != up && gp != 0)681 MPN_COPY (gp, up, size);682 #if EXTEND683 MPN_NORMALIZE (s0p, ssize);684 if (orig_s0p != s0p)685 MPN_COPY (orig_s0p, s0p, ssize);686 *s0size = sign >= 0 ? ssize : -ssize;687 #endif688 TMP_FREE;689 return size;690 1156 } 691 1157 else 692 1158 { 693 mp_limb_t vl, ul, t; 694 #if EXTEND 695 mp_size_t qsize, i; 696 #endif 697 vl = vp[0]; 698 #if EXTEND 699 t = mpn_divmod_1 (wp, up, size, vl); 700 701 MPN_COPY (tp, s0p, ssize); 702 703 qsize = size - (wp[size - 1] == 0); /* size of quotient from division */ 704 if (ssize < qsize) 705 { 706 MPN_ZERO (tp + ssize, qsize - ssize); 707 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ 708 for (i = 0; i < ssize; i++) 709 { 710 mp_limb_t cy; 711 cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]); 712 tp[qsize + i] = cy; 713 } 1159 /* We have r0 = u0 a + v0 b, 1160 r1 = u1 a + v1 b 1161 1162 Compute g = u r0 + v r1 = (u u0 + v u1) a + (...) 
b 1163 In the expression (u u0 + v u1), we have 1164 1165 u <= r1, 1166 u0 <= b/r0 (except if r0 = a, which should never be the case here) 1167 v <= r0 1168 u1 <= b/r0 1169 */ 1170 1171 mp_size_t gsize; 1172 mp_size_t usize; 1173 mp_size_t vsize; 1174 1175 /* u1 should be non-zero, and normalized */ 1176 ASSERT (rsize); 1177 ASSERT (r[1].uvp[0][rsize - 1] != 0); 1178 #if WANT_TRACE 1179 trace ("gcdext: \n" 1180 "r0 = %Nd\n" 1181 "r1 = %Nd\n" 1182 "u0 = %Nd\n" 1183 "u1 = %Nd\n", 1184 r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, 1185 r[0].uvp[0], rsize, r[1].uvp[0], rsize); 1186 #endif 1187 /* We don't need the space for hgcd and the quotient stack any more */ 1188 tp -= scratch; talloc += scratch; 1189 1190 /* Stores u in r[2] and v in r[3] */ 1191 gsize = gcdext_lehmer (gp, r[2].uvp[0], &usize, 1192 r[0].rp, r[0].rsize, 1193 r[1].rp, r[1].rsize, 1194 tp, talloc); 1195 1196 if (usize == 0) 1197 { 1198 /* u == 0 ==> v = g / b == 1 ==> g = u1 a + (...) b */ 1199 1200 MPN_NORMALIZE (r[1].uvp[0], rsize); 1201 MPN_COPY (up, r[1].uvp[0], rsize); 1202 *usizep = (rsign >= 0) ? - rsize : rsize; 1203 1204 return gsize; 714 1205 } 1206 1207 /* Compute v = (g - s r0) / r1, storing it in r[3] */ 1208 vsize = compute_v (r[3].uvp[0], ualloc, 1209 r[0].rp, r[0].rsize, r[1].rp, r[1].rsize, 1210 gp, gsize, 1211 r[2].uvp[0], usize, 1212 tp, talloc); 1213 1214 if (usize < 0) 1215 { 1216 usize = - usize; 1217 rsign = ~rsign; 1218 } 1219 1220 /* It's possible that u0 = 0, u1 = 1 */ 1221 if (rsize == 1 && r[0].uvp[0][0] == 0) 1222 { 1223 /* u0 == 0 ==> u u0 + v u1 = v */ 1224 MPN_COPY (up, r[3].uvp[0], vsize); 1225 *usizep = (rsign >= 0) ? vsize : - vsize; 1226 1227 return gsize; 1228 } 1229 1230 /* Ok, now u0, u1, u are non-zero. We may still have v == 0 */ 1231 ASSERT (usize + rsize <= ualloc); 1232 ASSERT (vsize + rsize <= ualloc); 1233 1234 /* Compute u u0 */ 1235 if (usize <= rsize) 1236 /* Should be the common case */ 1237 mpn_mul (up, 1238 r[0].uvp[0], rsize, 1239 r[2].uvp[0], usize); 715 1240 else 1241 mpn_mul (up, 1242 r[2].uvp[0], usize, 1243 r[0].uvp[0], rsize); 1244 1245 usize += rsize; 1246 1247 /* There may be more than one zero limb, if #u0 < #u1 */ 1248 MPN_NORMALIZE (up, usize); 1249 ASSERT (usize < ualloc); 1250 1251 if (vsize) 716 1252 { 717 MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */ 718 for (i = 0; i < qsize; i++) 1253 mp_limb_t cy; 1254 1255 /* Overwrites old r[2].uvp[0] value */ 1256 if (vsize <= rsize) 1257 /* Should be the common case */ 1258 cy = mpn_mul (r[2].uvp[0], 1259 r[1].uvp[0], rsize, 1260 r[3].uvp[0], vsize); 1261 else 1262 cy = mpn_mul (r[2].uvp[0], 1263 r[3].uvp[0], vsize, 1264 r[1].uvp[0], rsize); 1265 1266 vsize += rsize - (cy == 0); 1267 ASSERT (vsize < ualloc); 1268 1269 if (vsize <= usize) 1270 cy = mpn_add (up, up, usize, r[2].uvp[0], vsize); 1271 else 719 1272 { 720 mp_limb_t cy; 721 cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]); 722 tp[ssize + i] = cy; 1273 cy = mpn_add (up, r[2].uvp[0], vsize, up, usize); 1274 usize = vsize; 723 1275 } 1276 up[usize] = cy; 1277 usize += (cy != 0); 1278 1279 ASSERT (usize < ualloc); 724 1280 } 725 ssize += qsize; 726 ssize -= tp[ssize - 1] == 0; 1281 *usizep = (rsign >= 0) ? 
usize : -usize; 727 1282 728 sign = -sign; 729 MP_PTR_SWAP (s0p, s1p); 730 MP_PTR_SWAP (s1p, tp); 731 #else 732 t = mpn_mod_1 (up, size, vl); 733 #endif 734 ul = vl; 735 vl = t; 736 while (vl != 0) 737 { 738 mp_limb_t t; 739 #if EXTEND 740 mp_limb_t q; 741 q = ul / vl; 742 t = ul - q * vl; 743 744 MPN_COPY (tp, s0p, ssize); 745 746 MPN_ZERO (s1p + ssize, 1); /* zero s1 too */ 747 748 { 749 mp_limb_t cy; 750 cy = mpn_addmul_1 (tp, s1p, ssize, q); 751 tp[ssize] = cy; 752 } 753 754 ssize += 1; 755 ssize -= tp[ssize - 1] == 0; 756 757 sign = -sign; 758 MP_PTR_SWAP (s0p, s1p); 759 MP_PTR_SWAP (s1p, tp); 1283 return gsize; 1284 } 1285 } 1286 1287 mp_size_t 1288 mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, 1289 mp_ptr ap, mp_size_t asize, mp_ptr bp, mp_size_t bsize) 1290 { 1291 ASSERT (asize >= bsize); 1292 ASSERT (bsize > 0); 1293 1294 if (asize == 1) 1295 { 1296 #if GCDEXT_1_USE_BINARY 1297 mp_limb_t v; 1298 *gp = gcdext_1 (up, &v, ap[0], bp[0]); 760 1299 #else 761 t = ul % vl;1300 *gp = gcdext_1_u (up, ap[0], bp[0]); 762 1301 #endif 763 ul = vl; 764 vl = t; 765 } 766 if (gp != 0) 767 gp[0] = ul; 768 #if EXTEND 769 MPN_NORMALIZE (s0p, ssize); 770 if (orig_s0p != s0p) 771 MPN_COPY (orig_s0p, s0p, ssize); 772 *s0size = sign >= 0 ? ssize : -ssize; 773 #endif 774 TMP_FREE; 1302 *usizep = (up[0] != 0); 1303 ASSERT(gp[0] != 0); 775 1304 return 1; 776 1305 } 1306 else if (BELOW_THRESHOLD (asize, GCDEXT_SCHOENHAGE_THRESHOLD)) 1307 { 1308 mp_size_t gsize; 1309 mp_ptr tp; 1310 mp_size_t talloc = gcdext_lehmer_itch (asize, bsize); 1311 TMP_DECL; 1312 TMP_MARK; 1313 1314 tp = TMP_ALLOC_LIMBS (talloc); 1315 gsize = gcdext_lehmer (gp, up, usizep, ap, asize, bp, bsize, 1316 tp, talloc); 1317 TMP_FREE; 1318 return gsize; 1319 } 1320 else 1321 { 1322 mp_size_t gsize; 1323 mp_ptr tp; 1324 mp_size_t talloc = gcdext_schoenhage_itch (asize, bsize); 1325 TMP_DECL; 1326 TMP_MARK; 1327 1328 tp = TMP_ALLOC_LIMBS (talloc); 1329 gsize = gcdext_schoenhage (gp, up, usizep, ap, asize, bp, bsize, 1330 tp, talloc); 1331 TMP_FREE; 1332 return gsize; 1333 } 777 1334 } -
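Note (not part of the patch): every code path above, gcdext_lehmer, gcdext_schoenhage and the mpn_gcdext dispatcher, must preserve the Bezout identity g = u*a + v*b, with the second cofactor reconstructed by compute_v as v = (g - u*a) / b. The sketch below is illustrative only; it exercises the same machinery through the public mpz_gcdext interface, which returns both cofactors directly. The operand sizes are arbitrary, chosen to be large enough that most builds take the Schoenhage branch.

/* Illustrative sketch only -- not part of the patch.  Checks the
   Bezout identity g = s*a + t*b through the public mpz interface,
   which is backed by the mpn_gcdext entry point patched above. */
#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mpz_t a, b, g, s, t, chk;
  gmp_randstate_t rnd;

  mpz_init (a); mpz_init (b); mpz_init (g);
  mpz_init (s); mpz_init (t); mpz_init (chk);
  gmp_randinit_default (rnd);

  mpz_urandomb (a, rnd, 200000);
  mpz_urandomb (b, rnd, 150000);

  mpz_gcdext (g, s, t, a, b);

  mpz_mul (chk, s, a);        /* chk = s*a       */
  mpz_addmul (chk, t, b);     /* chk = s*a + t*b */

  printf (mpz_cmp (chk, g) == 0 ? "bezout identity ok\n" : "FAIL\n");

  mpz_clear (a); mpz_clear (b); mpz_clear (g);
  mpz_clear (s); mpz_clear (t); mpz_clear (chk);
  gmp_randclear (rnd);
  return 0;
}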
mpn/generic/hgcd.c
diff -N -u -r gmp-4.2.1/mpn/generic/hgcd.c gmp-4.2.1-patched/mpn/generic/hgcd.c
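Note (not part of the patch): the new hgcd.c below repeatedly applies the bookkeeping of hgcd_update_r, which computes r2 = r0 - q r1, and hgcd_update_uv, which computes (u2, v2) = (u0, v0) + q (u1, v1), flipping hgcd->sign at each step because the true cofactors alternate in sign. The single-word sketch that follows is illustrative only and shows that invariant, r = +/-(u*a - v*b), with plain unsigned longs standing in for limb vectors.

/* Illustrative sketch only -- not part of the patch.  Single-word
   analogue of hgcd_update_r / hgcd_update_uv: each division step
   computes r2 = r0 - q*r1 and (u2,v2) = (u0,v0) + q*(u1,v1), while a
   sign word records that the cofactors alternate in sign, so that
   r0 = u0*a - v0*b when sign == 0 and r0 = v0*b - u0*a when
   sign == ~0. */
#include <stdio.h>

int
main (void)
{
  unsigned long a = 34567, b = 12345;
  unsigned long r0 = a, r1 = b;
  unsigned long u0 = 1, v0 = 0;     /* a =   1*a - 0*b  */
  unsigned long u1 = 0, v1 = 1;     /* b = -(0*a - 1*b) */
  int sign = 0;

  while (r1 != 0)
    {
      unsigned long q = r0 / r1;
      unsigned long r2 = r0 - q * r1;          /* hgcd_update_r  */
      unsigned long u2 = u0 + q * u1;          /* hgcd_update_uv */
      unsigned long v2 = v0 + q * v1;

      r0 = r1; u0 = u1; v0 = v1;               /* HGCD_SWAP-style shift */
      r1 = r2; u1 = u2; v1 = v2;
      sign = ~sign;
    }

  /* r0 is now the gcd; with sign == 0 it equals u0*a - v0*b, with
     sign == ~0 it equals v0*b - u0*a. */
  printf ("gcd(%lu,%lu) = %lu  (u0 = %lu, v0 = %lu, sign = %d)\n",
          a, b, r0, u0, v0, sign);
  return 0;
}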
old new 1 /* hgcd.c. 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 2003, 2004 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation; either version 2.1 of the License, or (at your 14 option) any later version. 15 16 The GNU MP Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 19 License for more details. 20 21 You should have received a copy of the GNU General Public License 22 along with the GNU MP Library; see the file COPYING. If not, write to 23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, 24 MA 02111-1307, USA. */ 25 26 #define WANT_TRACE 0 27 28 #if WANT_TRACE 29 # include <stdio.h> 30 # include <stdarg.h> 31 #endif 32 33 #include "gmp.h" 34 #include "gmp-impl.h" 35 #include "longlong.h" 36 37 #if WANT_TRACE 38 static void 39 trace (const char *format, ...) 40 { 41 va_list args; 42 va_start (args, format); 43 gmp_vfprintf (stderr, format, args); 44 va_end (args); 45 } 46 #endif 47 48 /* Comparison of _normalized_ numbers. */ 49 50 #define MPN_EQUAL_P(ap, asize, bp, bsize) \ 51 ((asize) == (bsize) && mpn_cmp ((ap), (bp), (asize)) == 0) 52 53 #define MPN_LEQ_P(ap, asize, bp, bsize) \ 54 ((asize) < (bsize) || ((asize) == (bsize) \ 55 && mpn_cmp ((ap), (bp), (asize)) <= 0)) 56 57 #define MPN_LESS_P(ap, asize, bp, bsize) \ 58 ((asize) < (bsize) || ((asize) == (bsize) \ 59 && mpn_cmp ((ap), (bp), (asize)) < 0)) 60 61 /* Extract one limb, shifting count bits left 62 ________ ________ 63 |___xh___||___xl___| 64 |____r____| 65 >count < 66 67 The count includes any nail bits, so it should work fine if 68 count is computed using count_leading_zeros. 69 */ 70 71 #define MPN_EXTRACT_LIMB(count, xh, xl) \ 72 ((((xh) << ((count) - GMP_NAIL_BITS)) & GMP_NUMB_MASK) | \ 73 ((xl) >> (GMP_LIMB_BITS - (count)))) 74 75 76 /* Return -1 if a < x + y + z, 77 0 if a = x + y + z, 78 1 if a > x + y + z. */ 79 static int 80 mpn_cmp_sum3 (mp_srcptr ap, mp_size_t an, 81 mp_srcptr xp, mp_size_t xn, 82 mp_srcptr yp, mp_size_t yn, 83 mp_srcptr zp, mp_size_t zn) 84 { 85 mp_limb_t cy; 86 87 /* Check that all limbs beyond an are zero. This should be slightly 88 cheaper than fully normalizing all the input numbers. */ 89 90 while (xn > an) 91 if (xp[--xn] > 0) return -1; 92 while (yn > an) 93 if (yp[--yn] > 0) return -1; 94 while (zn > an) 95 if (zp[--zn] > 0) return -1; 96 97 /* Start by sorting so that xn >= yn >= zn. Six permutations, so we 98 can't get away with less than three comparisons, at least not for 99 the worst case. */ 100 101 if (xn < yn) 102 MPN_SRCPTR_SWAP (xp, xn, yp, yn); 103 if (yn < zn) 104 MPN_SRCPTR_SWAP (yp, yn, zp, zn); 105 if (xn < yn) 106 MPN_SRCPTR_SWAP (xp, xn, yp, yn); 107 108 ASSERT (an >= xn && xn >= yn && yn >= zn); 109 110 /* Assume that a = x + y + z, and write the addition limb by limb. 111 112 (c[1], a[0]) = x[0] + y[0] + z[0] + c[0] 113 (c[2], a[1]) = x[1] + y[1] + z[1] + c[1] 114 (c[k+1], a[k]) = x[k] + y[k] + z[k] + c[2] 115 ... 
116 (c[n], a[n-1]) = x[n-1] + y[n-1] + z[n-1] + c[n-1] 117 118 where the start and stop conditions are that c[0] = c[n] = 0. 119 Then we can start at the high end, iterating 120 121 c[k] = (c[k+1], a[k]) - x[k] - y[k] - z[k] 122 123 If equality holds, then 0 <= c[k] <= 2 for all k (since for 124 example 0xf + 0xf + 0xf + 2 = 0x2f). If we find c[k] < 0, then we 125 know that a < x + y + z, and if we find c[k] > 2, then we know a 126 > x + y + z. */ 127 128 cy = 0; 129 130 while (an > xn) 131 { 132 /* c[k] = (c[k+1], a[k]) */ 133 if (cy > 0) 134 return 1; 135 136 cy = ap[--an]; 137 } 138 139 #if GMP_NAIL_BITS >= 2 140 while (an > yn) 141 { 142 if (cy > 1) 143 return 1; 144 145 cy = (cy << GMP_NUMB_BITS) + ap[--an]; 146 if (cy < xp[an]) 147 return -1; 148 cy -= xp[an]; 149 } 150 while (an > zn) 151 { 152 mp_limb_t s; 153 154 if (cy > 2) 155 return 1; 156 157 cy = (cy << GMP_NUMB_BITS ) + ap[--an]; 158 s = xp[an] + yp[an]; 159 if (cy < s) 160 return -1; 161 cy -= s; 162 } 163 while (an > 0) 164 { 165 mp_limb_t s; 166 167 if (cy > 2) 168 return 1; 169 170 cy = (cy << GMP_NUMB_BITS ) + ap[--an]; 171 s = xp[an] + yp[an] + zp[an]; 172 if (cy < s) 173 return -1; 174 cy -= s; 175 } 176 #else /* GMP_NAIL_BITS < 2 */ 177 #if GMP_NAIL_BITS == 1 178 loselose 179 #endif 180 while (an > yn) 181 { 182 /* c[k] = (c[k+1], a[k]) - x[k] */ 183 if (cy > 1) 184 return 1; 185 186 --an; 187 188 if (cy == 1) 189 { 190 if (ap[an] >= xp[an]) 191 return 1; 192 cy = (ap[an] - xp[an]) & GMP_NUMB_MASK; 193 } 194 else 195 { 196 /* cy == 0 */ 197 if (ap[an] < xp[an]) 198 return -1; 199 else 200 cy = ap[an] - xp[an]; 201 } 202 } 203 204 while (an > zn) 205 { 206 mp_limb_t sh, sl; 207 208 /* c[k] = (c[k+1], a[k]) - x[k] - y[k] */ 209 if (cy > 2) 210 return 1; 211 212 --an; 213 214 sl = xp[an] + yp[an]; 215 sh = (sl < xp[an]); 216 217 if (cy < sh || (cy == sh && ap[an] < sl)) 218 return -1; 219 220 sl = ap[an] - sl; /* Monkey business */ 221 sh = cy - sh - (sl > ap[an]); 222 if (sh > 0) 223 return 1; 224 cy = sl; 225 } 226 while (an > 0) 227 { 228 mp_limb_t sh, sl; 229 if (cy > 2) 230 return 1; 231 232 --an; 233 234 sl = xp[an] + yp[an]; 235 sh = (sl < xp[an]); 236 237 sl += zp[an]; 238 sh += sl < zp[an]; 239 240 if (cy < sh || (cy == sh && ap[an] < sl)) 241 return -1; 242 sl = ap[an] - sl; /* Monkey business */ 243 sh = cy - sh - (sl > ap[an]); 244 if (sh > 0) 245 return 1; 246 cy = sl; 247 } 248 #endif /* GMP_NAIL_BITS < 2 */ 249 return cy > 0; 250 } 251 252 /* Only the first row has v = 0, a = 1 * a + 0 * b */ 253 static inline int 254 hgcd_start_row_p (const struct hgcd_row *r, mp_size_t n) 255 { 256 mp_size_t i; 257 mp_srcptr vp = r->uvp[1]; 258 259 for (i = 0; i < n; i++) 260 if (vp[i] != 0) 261 return 0; 262 263 return 1; 264 } 265 266 /* Called when r[0, 1, 2] >= W^M, r[3] < W^M. Returns the number of 267 remainders that satisfy Jebelean's criterion, i.e. find the largest k 268 such that 269 270 r[k+1] >= max (-u[k+1], - v[k+1]) 271 272 r[k] - r[k-1] >= max (u[k+1] - u[k], v[k+1] - v[k]) 273 274 Return 0 on failure, i.e. if B or A mod B < W^M. Return 1 in case 275 r0 and r1 are correct, but we still make no progress because r0 = 276 A, r1 = B. 277 278 Otherwise return 2, 3 or 4, the number of r:s that are correct. 
279 */ 280 static int 281 hgcd_jebelean (const struct hgcd *hgcd, mp_size_t M) 282 { 283 mp_size_t L; 284 unsigned bit; 285 286 ASSERT (hgcd->row[0].rsize > M); 287 ASSERT (hgcd->row[1].rsize > M); 288 ASSERT (hgcd->row[2].rsize > M); 289 ASSERT (hgcd->row[3].rsize <= M); 290 291 ASSERT (MPN_LESS_P (hgcd->row[1].rp, hgcd->row[1].rsize, 292 hgcd->row[0].rp, hgcd->row[0].rsize)); 293 ASSERT (MPN_LESS_P (hgcd->row[2].rp, hgcd->row[2].rsize, 294 hgcd->row[1].rp, hgcd->row[1].rsize)); 295 ASSERT (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, 296 hgcd->row[2].rp, hgcd->row[2].rsize)); 297 298 ASSERT (mpn_cmp (hgcd->row[0].uvp[1], hgcd->row[1].uvp[1], hgcd->size) <= 0); 299 ASSERT (mpn_cmp (hgcd->row[1].uvp[1], hgcd->row[2].uvp[1], hgcd->size) <= 0); 300 ASSERT (mpn_cmp (hgcd->row[2].uvp[1], hgcd->row[3].uvp[1], hgcd->size) <= 0); 301 302 /* The bound is really floor (N/2), which is <= M = ceil (N/2) */ 303 L = hgcd->size; 304 ASSERT (L <= M); 305 306 ASSERT (L > 0); 307 ASSERT (hgcd->row[3].uvp[1][L - 1] != 0); 308 309 bit = hgcd->sign < 0; 310 311 /* Check r1 - r2 >= max (u2 - u1, v2 - v1) = {|u1| + |u2|, |v1| + |v2|}[bit] */ 312 313 if (mpn_cmp_sum3 (hgcd->row[1].rp, hgcd->row[1].rsize, 314 hgcd->row[2].rp, hgcd->row[2].rsize, 315 hgcd->row[1].uvp[bit], L, 316 hgcd->row[2].uvp[bit], L) < 0) 317 return 2 - (hgcd_start_row_p (hgcd->row, hgcd->size)); 318 319 /* Ok, r2 is correct */ 320 321 /* Check r3 >= max (-u3, -v3) = (|u3|, |v3|)[bit] */ 322 if (hgcd->row[3].rsize > L) 323 /* Condition satisfied */ 324 ; 325 else 326 { 327 mp_size_t size; 328 for (size = L; size > hgcd->row[3].rsize; size--) 329 { 330 if (hgcd->row[3].uvp[bit][size-1] != 0) 331 return 3; 332 } 333 if (mpn_cmp (hgcd->row[3].rp, hgcd->row[3].uvp[bit], size) < 0) 334 return 3; 335 } 336 337 /* Check r3 - r2 >= max(u3-u2, v3-v2) = {|u2| + |u3|, |v2| +|v3|}[1-bit] */ 338 339 if (mpn_cmp_sum3 (hgcd->row[2].rp, hgcd->row[2].rsize, 340 hgcd->row[3].rp, hgcd->row[3].rsize, 341 hgcd->row[2].uvp[bit ^ 1], L, 342 hgcd->row[3].uvp[bit ^ 1], L) < 0) 343 return 3; 344 345 /* Ok, r3 is correct */ 346 return 4; 347 } 348 349 350 /* Compute au + bv. u and v are single limbs, a and b are n limbs each. 351 Stores n+1 limbs in rp, and returns the (n+2)'nd limb. */ 352 /* FIXME: With nails, we can instead return limb n+1, possibly including 353 one non-zero nail bit. 
*/ 354 static mp_limb_t 355 mpn_addmul2_n_1 (mp_ptr rp, mp_size_t n, 356 mp_srcptr ap, mp_limb_t u, 357 mp_srcptr bp, mp_limb_t v) 358 { 359 mp_limb_t h; 360 mp_limb_t cy; 361 362 h = mpn_mul_1 (rp, ap, n, u); 363 cy = mpn_addmul_1 (rp, bp, n, v); 364 h += cy; 365 #if GMP_NAIL_BITS == 0 366 rp[n] = h; 367 return (h < cy); 368 #else /* GMP_NAIL_BITS > 0 */ 369 rp[n] = h & GMP_NUMB_MASK; 370 return h >> GMP_NUMB_BITS; 371 #endif /* GMP_NAIL_BITS > 0 */ 372 } 373 374 375 static inline void 376 qstack_drop (struct qstack *stack) 377 { 378 ASSERT (stack->size_next); 379 stack->limb_next -= stack->size[--stack->size_next]; 380 } 381 382 /* Get top element */ 383 static inline mp_size_t 384 qstack_get_0 (const struct qstack *stack, 385 mp_srcptr *qp) 386 { 387 mp_size_t qsize; 388 ASSERT (stack->size_next); 389 390 qsize = stack->size[stack->size_next - 1]; 391 *qp = stack->limb + stack->limb_next - qsize; 392 393 return qsize; 394 } 395 396 /* Get element just below the top */ 397 static inline mp_size_t 398 qstack_get_1 (const struct qstack *stack, 399 mp_srcptr *qp) 400 { 401 mp_size_t qsize; 402 ASSERT (stack->size_next >= 2); 403 404 qsize = stack->size[stack->size_next - 2]; 405 *qp = stack->limb + stack->limb_next 406 - stack->size[stack->size_next - 1] 407 - qsize; 408 409 return qsize; 410 } 411 412 /* Adds d to the element on top of the stack */ 413 static void 414 qstack_adjust (struct qstack *stack, mp_limb_t d) 415 { 416 mp_size_t qsize; 417 mp_ptr qp; 418 419 ASSERT (stack->size_next); 420 421 ASSERT_QSTACK (stack); 422 423 if (stack->limb_next >= stack->limb_alloc) 424 { 425 qstack_rotate (stack, 1); 426 } 427 428 ASSERT (stack->limb_next < stack->limb_alloc); 429 430 qsize = stack->size[stack->size_next - 1]; 431 qp = stack->limb + stack->limb_next - qsize; 432 433 if (qsize == 0) 434 { 435 qp[0] = 1 + d; 436 stack->size[stack->size_next - 1] = 1; 437 stack->limb_next++; 438 } 439 else 440 { 441 mp_limb_t cy = mpn_add_1 (qp, qp, qsize, d); 442 if (cy) 443 { 444 qp[qsize] = cy; 445 stack->size[stack->size_next - 1]++; 446 stack->limb_next++; 447 } 448 } 449 450 ASSERT_QSTACK (stack); 451 } 452 453 /* hgcd2 operations */ 454 455 /* Computes P = R * S. No overlap allowed. */ 456 static mp_size_t 457 hgcd2_mul (struct hgcd_row *P, mp_size_t alloc, 458 const struct hgcd2_row *R, 459 const struct hgcd_row *S, mp_size_t n) 460 { 461 int grow = 0; 462 mp_limb_t h = 0; 463 unsigned i; 464 unsigned j; 465 466 ASSERT (n < alloc); 467 468 for (i = 0; i < 2; i++) 469 for (j = 0; j < 2; j++) 470 { 471 /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] 472 = u_i s0j + v_i s1j */ 473 mp_limb_t cy; 474 475 cy = mpn_addmul2_n_1 (P[i].uvp[j], n, 476 S[0].uvp[j], R[i].u, 477 S[1].uvp[j], R[i].v); 478 if (cy) 479 { 480 ASSERT (n + 2 <= alloc); 481 P[i].uvp[j][n+1] = cy; 482 grow = 1; 483 } 484 else 485 h |= P[i].uvp[j][n]; 486 } 487 if (grow) 488 return n + 2; 489 else 490 /* Don't add redundant zeroes */ 491 return n + (h != 0); 492 } 493 494 unsigned 495 mpn_hgcd_max_recursion (mp_size_t n) 496 { 497 int count; 498 499 count_leading_zeros (count, (mp_limb_t) 500 (1 + n / (HGCD_SCHOENHAGE_THRESHOLD - 5))); 501 502 return GMP_LIMB_BITS - count; 503 } 504 505 mp_size_t 506 mpn_hgcd_init_itch (mp_size_t size) 507 { 508 /* r0 <= a, r1, r2, r3 <= b, but for simplicity, we allocate asize + 509 1 for all of them. The size of the uv:s are limited to asize / 2, 510 but we allocate one extra limb. 
*/ 511 512 return 4 * (size + 1) + 8 * ((size / 2) + 1); 513 } 514 515 void 516 mpn_hgcd_init (struct hgcd *hgcd, 517 mp_size_t asize, 518 mp_limb_t *limbs) 519 { 520 unsigned i; 521 unsigned j; 522 mp_size_t alloc = (asize / 2) + 1; 523 524 hgcd->sign = 0; 525 526 for (i = 0; i < 4; i++) 527 { 528 hgcd->row[i].rp = limbs; 529 hgcd->row[i].rsize = asize + 1; limbs += asize + 1; 530 } 531 532 hgcd->alloc = alloc; 533 hgcd->size = alloc; 534 535 for (i = 0; i < 4; i++) 536 for (j = 0; j < 2; j++) 537 { 538 hgcd->row[i].uvp[j] = limbs; 539 limbs += alloc; 540 } 541 } 542 543 #if WANT_ASSERT 544 void 545 __gmpn_hgcd_sanity (const struct hgcd *hgcd, 546 mp_srcptr ap, mp_size_t asize, 547 mp_srcptr bp, mp_size_t bsize, 548 unsigned start, unsigned end) 549 { 550 int sign; 551 unsigned i; 552 mp_size_t L = hgcd->size; 553 mp_ptr tp; 554 mp_size_t talloc; 555 mp_ptr t1p; 556 mp_ptr t2p; 557 const struct hgcd_row *r; 558 559 ASSERT (asize >= bsize); 560 561 ASSERT (L <= asize / 2); 562 ASSERT (L); 563 564 ASSERT (L <= asize); 565 ASSERT (L <= bsize); 566 567 /* NOTE: We really need only asize + bsize + 2*L, but since we're 568 * swapping the pointers around, we allocate 2*(asize + L). */ 569 talloc = 2*(asize + L); 570 tp = __GMP_ALLOCATE_FUNC_LIMBS (talloc); 571 t1p = tp; 572 t2p = t1p + (asize + L); 573 574 sign = hgcd->sign; 575 if (start % 2) 576 sign = ~sign; 577 for (i = start, r = &hgcd->row[start]; i < end; i++, sign = ~sign, r++) 578 { 579 mp_size_t t1size = asize + L; 580 mp_size_t t2size = bsize + L; 581 582 mp_size_t k; 583 for (k = hgcd->size; k < hgcd->alloc; k++) 584 { 585 ASSERT (r->uvp[0][k] == 0); 586 ASSERT (r->uvp[1][k] == 0); 587 } 588 589 mpn_mul (t1p, ap, asize, r->uvp[0], L); 590 mpn_mul (t2p, bp, bsize, r->uvp[1], L); 591 592 if (sign < 0) 593 MPN_PTR_SWAP (t1p, t1size, t2p, t2size); 594 595 MPN_NORMALIZE (t2p, t2size); 596 ASSERT (t2size <= t1size); 597 ASSERT_NOCARRY (mpn_sub (t1p, t1p, t1size, t2p, t2size)); 598 599 MPN_NORMALIZE (t1p, t1size); 600 ASSERT (MPN_EQUAL_P (t1p, t1size, r->rp, r->rsize)); 601 } 602 __GMP_FREE_FUNC_LIMBS (tp, talloc); 603 for (i = start; i < end - 1; i++) 604 { 605 /* We should have strict inequality after each reduction step, 606 but we allow equal values for input. 
*/ 607 ASSERT (MPN_LEQ_P (hgcd->row[i+1].rp, hgcd->row[i+1].rsize, 608 hgcd->row[i].rp, hgcd->row[i].rsize)); 609 } 610 } 611 #endif /* WANT_ASSERT */ 612 613 /* Helper functions for hgcd */ 614 /* Sets (a, b, c, d) <-- (b, c, d, a) */ 615 #define HGCD_SWAP4_LEFT(row) \ 616 do { \ 617 struct hgcd_row __hgcd_swap4_left_tmp; \ 618 __hgcd_swap4_left_tmp = row[0]; \ 619 row[0] = row[1]; \ 620 row[1] = row[2]; \ 621 row[2] = row[3]; \ 622 row[3] = __hgcd_swap4_left_tmp; \ 623 } while (0) 624 625 /* Sets (a, b, c, d) <-- (d, a, b, c) */ 626 #define HGCD_SWAP4_RIGHT(row) \ 627 do { \ 628 struct hgcd_row __hgcd_swap4_right_tmp; \ 629 __hgcd_swap4_right_tmp = row[3]; \ 630 row[3] = row[2]; \ 631 row[2] = row[1]; \ 632 row[1] = row[0]; \ 633 row[0] = __hgcd_swap4_right_tmp; \ 634 } while (0) 635 636 /* Sets (a, b, c, d) <-- (c, d, a, b) */ 637 #define HGCD_SWAP4_2(row) \ 638 do { \ 639 struct hgcd_row __hgcd_swap4_2_tmp; \ 640 __hgcd_swap4_2_tmp = row[0]; \ 641 row[0] = row[2]; \ 642 row[2] = __hgcd_swap4_2_tmp; \ 643 __hgcd_swap4_2_tmp = row[1]; \ 644 row[1] = row[3]; \ 645 row[3] = __hgcd_swap4_2_tmp; \ 646 } while (0) 647 648 /* Sets (a, b, c) <-- (b, c, a) */ 649 #define HGCD_SWAP3_LEFT(row) \ 650 do { \ 651 struct hgcd_row __hgcd_swap4_left_tmp; \ 652 __hgcd_swap4_left_tmp = row[0]; \ 653 row[0] = row[1]; \ 654 row[1] = row[2]; \ 655 row[2] = __hgcd_swap4_left_tmp; \ 656 } while (0) 657 658 /* Computes P = R * S. No overlap allowed. 659 660 Temporary space is needed for two numbers smaller than the 661 resulting matrix elements, i.e. bounded by 2*L <= N. */ 662 static mp_size_t 663 hgcd_mul (struct hgcd_row *P, mp_size_t alloc, 664 const struct hgcd_row *R, mp_size_t rsize, 665 const struct hgcd_row *S, mp_size_t ssize, 666 mp_ptr tp, mp_size_t talloc) 667 { 668 unsigned i; 669 unsigned j; 670 671 mp_size_t psize; 672 mp_limb_t h = 0; 673 int grow = 0; 674 675 MPN_NORMALIZE (R[1].uvp[1], rsize); 676 ASSERT (S[1].uvp[1][ssize - 1] != 0); 677 678 psize = rsize + ssize; 679 ASSERT (psize <= talloc); 680 681 if (rsize >= ssize) 682 { 683 for (i = 0; i < 2; i++) 684 for (j = 0; j < 2; j++) 685 { 686 /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ 687 mp_limb_t cy; 688 689 mpn_mul (P[i].uvp[j], R[i].uvp[0], rsize, S[0].uvp[j], ssize); 690 mpn_mul (tp, R[i].uvp[1], rsize, S[1].uvp[j], ssize); 691 692 cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); 693 694 if (cy) 695 { 696 ASSERT (psize + 1 < alloc); 697 P[i].uvp[j][psize] = cy; 698 grow = 1; 699 } 700 else 701 h |= P[i].uvp[j][psize - 1]; 702 } 703 } 704 else 705 { 706 for (i = 0; i < 2; i++) 707 for (j = 0; j < 2; j++) 708 { 709 /* Set P[i, j] = R[i, 0] S[0, j] + R[i,1] S[1, j] */ 710 mp_limb_t cy; 711 712 mpn_mul (P[i].uvp[j], S[0].uvp[j], ssize, R[i].uvp[0], rsize); 713 mpn_mul (tp, S[1].uvp[j], ssize, R[i].uvp[1], rsize); 714 715 cy = mpn_add_n (P[i].uvp[j], P[i].uvp[j], tp, psize); 716 717 if (cy) 718 { 719 ASSERT (psize + 1 < alloc); 720 P[i].uvp[j][psize] = cy; 721 grow = 1; 722 } 723 else 724 h |= P[i].uvp[j][psize - 1]; 725 } 726 } 727 728 if (grow) 729 return psize + 1; 730 else 731 return psize - (h == 0); 732 } 733 734 /* Computes R = W^k s->r + s->u A' - s->v B', which must be 735 non-negative. W denotes 2^(GMP_NUMB_BITS). Temporary space needed 736 is k + uvsize <= M + L = N. 737 738 Must have v > 0, v >= u. 
*/ 739 740 mp_size_t 741 mpn_hgcd_fix (mp_size_t k, 742 mp_ptr rp, mp_size_t ralloc, 743 int sign, mp_size_t uvsize, 744 const struct hgcd_row *s, 745 mp_srcptr ap, 746 mp_srcptr bp, 747 mp_ptr tp, mp_size_t talloc) 748 { 749 mp_size_t tsize; 750 mp_limb_t cy; 751 mp_size_t rsize; 752 mp_srcptr up; 753 mp_srcptr vp; 754 755 up = s->uvp[0]; vp = s->uvp[1]; 756 MPN_NORMALIZE (vp, uvsize); 757 ASSERT (uvsize > 0); 758 759 if (sign < 0) 760 { 761 MP_SRCPTR_SWAP (up, vp); 762 MP_SRCPTR_SWAP (ap, bp); 763 } 764 765 tsize = k + uvsize; 766 767 ASSERT (k + s->rsize <= ralloc); 768 ASSERT (tsize <= talloc); 769 ASSERT (tsize <= ralloc); 770 771 ASSERT (rp != s->rp); 772 773 /* r = W^k s + u a */ 774 if (uvsize <= k) 775 mpn_mul (rp, ap, k, up, uvsize); 776 else 777 mpn_mul (rp, up, uvsize, ap, k); 778 779 if (uvsize <= s->rsize) 780 { 781 cy = mpn_add (rp + k, s->rp, s->rsize, rp + k, uvsize); 782 rsize = k + s->rsize; 783 } 784 else 785 { 786 cy = mpn_add (rp + k, rp + k, uvsize, s->rp, s->rsize); 787 rsize = k + uvsize; 788 } 789 790 if (cy) 791 { 792 ASSERT (rsize < ralloc); 793 rp[rsize++] = cy; 794 } 795 796 /* r -= v b */ 797 798 if (uvsize <= k) 799 mpn_mul (tp, bp, k, vp, uvsize); 800 else 801 mpn_mul (tp, vp, uvsize, bp, k); 802 803 ASSERT_NOCARRY (mpn_sub (rp, rp, rsize, tp, tsize)); 804 MPN_NORMALIZE (rp, rsize); 805 806 return rsize; 807 } 808 809 /* Compute r2 = r0 - q r1 */ 810 static void 811 hgcd_update_r (struct hgcd_row *r, mp_srcptr qp, mp_size_t qsize) 812 { 813 mp_srcptr r0p = r[0].rp; 814 mp_srcptr r1p = r[1].rp; 815 mp_ptr r2p = r[2].rp; 816 mp_size_t r0size = r[0].rsize; 817 mp_size_t r1size = r[1].rsize; 818 819 ASSERT (MPN_LESS_P (r1p, r1size, r0p, r0size)); 820 821 if (qsize == 0) 822 { 823 ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r1p, r1size)); 824 } 825 else if (qsize == 1) 826 { 827 mp_size_t size; 828 mp_limb_t cy = mpn_mul_1 (r2p, r1p, r1size, qp[0]); 829 size = r1size; 830 831 if (cy) 832 { 833 ASSERT (size < r0size); 834 r2p[size++] = cy; 835 } 836 837 ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); 838 } 839 else 840 { 841 mp_size_t size = r1size + qsize; 842 ASSERT (size <= r0size + 1); 843 844 if (qsize <= r1size) 845 mpn_mul (r2p, r1p, r1size, qp, qsize); 846 else 847 mpn_mul (r2p, qp, qsize, r1p, r1size); 848 849 if (size > r0size) 850 { 851 ASSERT (size == r0size + 1); 852 size--; 853 ASSERT (r2p[size] == 0); 854 } 855 856 ASSERT_NOCARRY (mpn_sub (r2p, r0p, r0size, r2p, size)); 857 } 858 859 MPN_NORMALIZE (r[2].rp, r0size); 860 r[2].rsize = r0size; 861 862 ASSERT (MPN_LESS_P (r2p, r0size, r1p, r1size)); 863 } 864 865 /* Compute (u2, v2) = (u0, v0) + q (u1, v1) 866 Return the size of the largest u,v element. 
867 Caller must ensure that usize + qsize <= available storage */ 868 static mp_size_t 869 hgcd_update_uv (struct hgcd_row *r, mp_size_t usize, 870 mp_srcptr qp, mp_size_t qsize) 871 { 872 unsigned i; 873 mp_size_t grow; 874 875 ASSERT (r[1].uvp[1][usize - 1] != 0); 876 877 /* Compute u2 = u0 + q u1 */ 878 879 if (qsize == 0) 880 { 881 /* Represents a unit quotient */ 882 mp_limb_t cy; 883 884 cy = mpn_add_n (r[2].uvp[0], r[0].uvp[0], r[1].uvp[0], usize); 885 r[2].uvp[0][usize] = cy; 886 887 cy = mpn_add_n (r[2].uvp[1], r[0].uvp[1], r[1].uvp[1], usize); 888 r[2].uvp[1][usize] = cy; 889 grow = cy; 890 } 891 else if (qsize == 1) 892 { 893 mp_limb_t q = qp[0]; 894 for (i = 0; i < 2; i++) 895 { 896 mp_srcptr u0p = r[0].uvp[i]; 897 mp_srcptr u1p = r[1].uvp[i]; 898 mp_ptr u2p = r[2].uvp[i]; 899 mp_limb_t cy; 900 901 /* Too bad we don't have an addmul_1 with distinct source and 902 destination */ 903 cy = mpn_mul_1 (u2p, u1p, usize, q); 904 cy += mpn_add_n (u2p, u2p, u0p, usize); 905 906 u2p[usize] = cy; 907 grow = cy != 0; 908 } 909 } 910 else 911 { 912 for (i = 0; i < 2; i++) 913 { 914 mp_srcptr u0p = r[0].uvp[i]; 915 mp_srcptr u1p = r[1].uvp[i]; 916 mp_ptr u2p = r[2].uvp[i]; 917 918 if (qsize <= usize) 919 mpn_mul (u2p, u1p, usize, qp, qsize); 920 else 921 mpn_mul (u2p, qp, qsize, u1p, usize); 922 923 ASSERT_NOCARRY (mpn_add (u2p, u2p, usize + qsize, u0p, usize)); 924 grow = qsize - ((u2p[usize + qsize - 1]) == 0); 925 } 926 } 927 928 usize += grow; 929 930 /* The values should be allocated with one limb margin */ 931 ASSERT (mpn_cmp (r[1].uvp[0], r[2].uvp[0], usize) <= 0); 932 ASSERT (mpn_cmp (r[1].uvp[1], r[2].uvp[1], usize) <= 0); 933 ASSERT (r[2].uvp[1][usize - 1] != 0); 934 935 return usize; 936 } 937 938 /* Compute r0 = r2 + q r1, and the corresponding uv */ 939 static void 940 hgcd_backup (struct hgcd_row *r, mp_size_t usize, 941 mp_srcptr qp, mp_size_t qsize) 942 { 943 mp_ptr r0p = r[0].rp; 944 mp_srcptr r1p = r[1].rp; 945 mp_srcptr r2p = r[2].rp; 946 mp_size_t r0size; 947 mp_size_t r1size = r[1].rsize; 948 mp_size_t r2size = r[2].rsize; 949 950 mp_ptr u0p = r[0].uvp[0]; 951 mp_ptr v0p = r[0].uvp[1]; 952 mp_srcptr u1p = r[1].uvp[0]; 953 mp_srcptr v1p = r[1].uvp[1]; 954 mp_srcptr u2p = r[2].uvp[0]; 955 mp_srcptr v2p = r[2].uvp[1]; 956 957 ASSERT (MPN_LESS_P (r2p, r2size, r1p, r1size)); 958 959 if (qsize == 0) 960 { 961 /* r0 = r2 + r1 */ 962 mp_limb_t cy = mpn_add (r0p, r1p, r1size, r2p, r2size); 963 r0size = r1size; 964 if (cy) 965 r0p[r0size++] = cy; 966 967 /* (u0,v0) = (u2,v2) - (u1, v1) */ 968 969 ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u1p, usize)); 970 ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v1p, usize)); 971 } 972 else if (qsize == 1) 973 { 974 /* r0 = r2 + q r1 975 976 Just like for mpn_addmul_1, the result is the same size as r1, or 977 one limb larger. */ 978 979 mp_limb_t cy; 980 981 cy = mpn_mul_1 (r0p, r1p, r1size, qp[0]); 982 cy += mpn_add (r0p, r0p, r1size, r2p, r2size); 983 984 r0size = r1size; 985 if (cy) 986 r0p[r0size++] = cy; 987 988 /* (u0,v0) = (u2,v2) - q (u1, v1) */ 989 990 ASSERT_NOCARRY (mpn_mul_1 (u0p, u1p, usize, qp[0])); 991 ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); 992 993 ASSERT_NOCARRY (mpn_mul_1 (v0p, v1p, usize, qp[0])); 994 ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); 995 } 996 else 997 { 998 /* r0 = r2 + q r1 999 1000 Result must be of size r1size + q1size - 1, or one limb 1001 larger. 
*/ 1002 1003 mp_size_t size; 1004 1005 r0size = r1size + qsize; 1006 if (r1size >= qsize) 1007 mpn_mul (r0p, r1p, r1size, qp, qsize); 1008 else 1009 mpn_mul (r0p, qp, qsize, r1p, r1size); 1010 1011 ASSERT_NOCARRY (mpn_add (r0p, r0p, r0size, r2p, r2size)); 1012 1013 r0size -= (r0p[r0size-1] == 0); 1014 1015 /* (u0,v0) = (u2,v2) - q (u1, v1) */ 1016 1017 /* We must have 1018 1019 usize >= #(q u1) >= qsize + #u1 - 1 1020 1021 which means that u1 must have at least 1022 1023 usize - #u1 >= qsize - 1 1024 1025 zero limbs at the high end, and similarly for v1. */ 1026 1027 ASSERT (qsize <= usize); 1028 size = usize - qsize + 1; 1029 #if WANT_ASSERT 1030 { 1031 mp_size_t i; 1032 for (i = size; i < usize; i++) 1033 { 1034 ASSERT (u1p[i] == 0); 1035 ASSERT (v1p[i] == 0); 1036 } 1037 } 1038 #endif 1039 /* NOTE: Needs an extra limb for the u,v values */ 1040 1041 if (qsize <= size) 1042 { 1043 mpn_mul (u0p, u1p, size, qp, qsize); 1044 mpn_mul (v0p, v1p, size, qp, qsize); 1045 } 1046 else 1047 { 1048 mpn_mul (u0p, qp, qsize, u1p, size); 1049 mpn_mul (v0p, qp, qsize, v1p, size); 1050 } 1051 1052 /* qsize + size = usize + 1 */ 1053 ASSERT (u0p[usize] == 0); 1054 ASSERT (v0p[usize] == 0); 1055 1056 ASSERT_NOCARRY (mpn_sub_n (u0p, u2p, u0p, usize)); 1057 ASSERT_NOCARRY (mpn_sub_n (v0p, v2p, v0p, usize)); 1058 } 1059 1060 r[0].rsize = r0size; 1061 } 1062 1063 /* Called after HGCD_SWAP4_RIGHT, to adjust the size field. Large 1064 numbers in row 0 don't count, and are overwritten. */ 1065 static void 1066 hgcd_normalize (struct hgcd *hgcd) 1067 { 1068 mp_size_t size = hgcd->size; 1069 1070 /* v3 should always be the largest element */ 1071 while (size > 0 && hgcd->row[3].uvp[1][size - 1] == 0) 1072 { 1073 size--; 1074 /* Row 0 is about to be overwritten. We must zero out unused limbs */ 1075 hgcd->row[0].uvp[0][size] = 0; 1076 hgcd->row[0].uvp[1][size] = 0; 1077 1078 ASSERT (hgcd->row[1].uvp[0][size] == 0); 1079 ASSERT (hgcd->row[1].uvp[1][size] == 0); 1080 ASSERT (hgcd->row[2].uvp[0][size] == 0); 1081 ASSERT (hgcd->row[2].uvp[1][size] == 0); 1082 ASSERT (hgcd->row[3].uvp[0][size] == 0); 1083 } 1084 1085 hgcd->size = size; 1086 } 1087 1088 int 1089 mpn_hgcd2_lehmer_step (struct hgcd2 *hgcd, 1090 mp_srcptr ap, mp_size_t asize, 1091 mp_srcptr bp, mp_size_t bsize, 1092 struct qstack *quotients) 1093 { 1094 mp_limb_t ah; 1095 mp_limb_t al; 1096 mp_limb_t bh; 1097 mp_limb_t bl; 1098 1099 ASSERT (asize >= bsize); 1100 ASSERT (MPN_LEQ_P (bp, bsize, ap, asize)); 1101 1102 if (bsize < 2) 1103 return 0; 1104 1105 #if 0 && WANT_TRACE 1106 trace ("lehmer_step:\n" 1107 " a = %Nd\n" 1108 " b = %Nd\n", 1109 ap, asize, bp, bsize); 1110 #endif 1111 #if WANT_TRACE 1112 trace ("lehmer_step: asize = %d, bsize = %d\n", asize, bsize); 1113 #endif 1114 1115 /* The case asize == 2 is needed to take care of values that are 1116 between one and two *full* limbs in size. 
*/ 1117 if (asize == 2 || (ap[asize-1] & GMP_NUMB_HIGHBIT)) 1118 { 1119 if (bsize < asize) 1120 return 0; 1121 1122 al = ap[asize - 2]; 1123 ah = ap[asize - 1]; 1124 1125 ASSERT (asize == bsize); 1126 bl = bp[asize - 2]; 1127 bh = bp[asize - 1]; 1128 } 1129 else 1130 { 1131 unsigned shift; 1132 if (bsize + 1 < asize) 1133 return 0; 1134 1135 /* We want two *full* limbs */ 1136 ASSERT (asize > 2); 1137 1138 count_leading_zeros (shift, ap[asize-1]); 1139 #if 0 && WANT_TRACE 1140 trace ("shift = %d\n", shift); 1141 #endif 1142 if (bsize == asize) 1143 bh = MPN_EXTRACT_LIMB (shift, bp[asize - 1], bp[asize - 2]); 1144 else 1145 { 1146 ASSERT (asize == bsize + 1); 1147 bh = bp[asize - 2] >> (GMP_LIMB_BITS - shift); 1148 } 1149 1150 bl = MPN_EXTRACT_LIMB (shift, bp[asize - 2], bp[asize - 3]); 1151 1152 al = MPN_EXTRACT_LIMB (shift, ap[asize - 2], ap[asize - 3]); 1153 ah = MPN_EXTRACT_LIMB (shift, ap[asize - 1], ap[asize - 2]); 1154 } 1155 1156 #if WANT_TRACE 1157 trace ("lehmer_step: ah = %lx, al = %lx, bh = %lx, bl = %lx\n", 1158 (unsigned long) ah, (unsigned long) al, 1159 (unsigned long) bh, (unsigned long) bl); 1160 #endif 1161 return mpn_hgcd2 (hgcd, ah, al, bh, bl, quotients); 1162 } 1163 1164 /* Called when r2 has been computed, and it is too small. Top element 1165 on the stack is r0/r1. One backup step is needed. */ 1166 static int 1167 hgcd_small_1 (struct hgcd *hgcd, mp_size_t M, 1168 struct qstack *quotients) 1169 { 1170 mp_srcptr qp; 1171 mp_size_t qsize; 1172 1173 if (hgcd_start_row_p (hgcd->row, hgcd->size)) 1174 { 1175 qstack_drop (quotients); 1176 return 0; 1177 } 1178 1179 HGCD_SWAP4_RIGHT (hgcd->row); 1180 hgcd_normalize (hgcd); 1181 1182 qsize = qstack_get_1 (quotients, &qp); 1183 1184 hgcd_backup (hgcd->row, hgcd->size, qp, qsize); 1185 hgcd->sign = ~hgcd->sign; 1186 1187 #if WANT_ASSERT 1188 qstack_rotate (quotients, 0); 1189 #endif 1190 1191 return hgcd_jebelean (hgcd, M); 1192 } 1193 1194 /* Called when r3 has been computed, and is small enough. Two backup 1195 steps are needed. */ 1196 static int 1197 hgcd_small_2 (struct hgcd *hgcd, mp_size_t M, 1198 const struct qstack *quotients) 1199 { 1200 mp_srcptr qp; 1201 mp_size_t qsize; 1202 1203 if (hgcd_start_row_p (hgcd->row + 2, hgcd->size)) 1204 return 0; 1205 1206 qsize = qstack_get_0 (quotients, &qp); 1207 hgcd_backup (hgcd->row+1, hgcd->size, qp, qsize); 1208 1209 if (hgcd_start_row_p (hgcd->row + 1, hgcd->size)) 1210 return 0; 1211 1212 qsize = qstack_get_1 (quotients, &qp); 1213 hgcd_backup (hgcd->row, hgcd->size, qp, qsize); 1214 1215 return hgcd_jebelean (hgcd, M); 1216 } 1217 1218 static void 1219 hgcd_start (struct hgcd *hgcd, 1220 mp_srcptr ap, mp_size_t asize, 1221 mp_srcptr bp, mp_size_t bsize) 1222 { 1223 MPN_COPY (hgcd->row[0].rp, ap, asize); 1224 hgcd->row[0].rsize = asize; 1225 1226 MPN_COPY (hgcd->row[1].rp, bp, bsize); 1227 hgcd->row[1].rsize = bsize; 1228 1229 hgcd->sign = 0; 1230 if (hgcd->size != 0) 1231 { 1232 /* We must zero out the uv array */ 1233 unsigned i; 1234 unsigned j; 1235 1236 for (i = 0; i < 4; i++) 1237 for (j = 0; j < 2; j++) 1238 MPN_ZERO (hgcd->row[i].uvp[j], hgcd->size); 1239 } 1240 #if WANT_ASSERT 1241 { 1242 unsigned i; 1243 unsigned j; 1244 mp_size_t k; 1245 1246 for (i = 0; i < 4; i++) 1247 for (j = 0; j < 2; j++) 1248 for (k = hgcd->size; k < hgcd->alloc; k++) 1249 ASSERT (hgcd->row[i].uvp[j][k] == 0); 1250 } 1251 #endif 1252 1253 hgcd->size = 1; 1254 hgcd->row[0].uvp[0][0] = 1; 1255 hgcd->row[1].uvp[1][0] = 1; 1256 } 1257 1258 /* Performs one euclid step on r0, r1. 
Returns >= 0 if hgcd should be 1259 terminated, -1 if we should go on */ 1260 static int 1261 euclid_step (struct hgcd *hgcd, mp_size_t M, 1262 struct qstack *quotients) 1263 { 1264 mp_size_t asize; 1265 1266 mp_size_t qsize; 1267 mp_size_t rsize; 1268 mp_ptr qp; 1269 mp_ptr rp; 1270 1271 asize = hgcd->row[0].rsize; 1272 rsize = hgcd->row[1].rsize; 1273 qsize = asize - rsize + 1; 1274 1275 /* Make sure we have space on stack */ 1276 ASSERT_QSTACK (quotients); 1277 1278 if (qsize > quotients->limb_alloc - quotients->limb_next) 1279 { 1280 qstack_rotate (quotients, 1281 qsize - (quotients->limb_alloc - quotients->limb_next)); 1282 ASSERT (quotients->size_next < QSTACK_MAX_QUOTIENTS); 1283 } 1284 else if (quotients->size_next >= QSTACK_MAX_QUOTIENTS) 1285 { 1286 qstack_rotate (quotients, 0); 1287 } 1288 1289 ASSERT (qsize <= quotients->limb_alloc - quotients->limb_next); 1290 1291 qp = quotients->limb + quotients->limb_next; 1292 1293 rp = hgcd->row[2].rp; 1294 mpn_tdiv_qr (qp, rp, 0, hgcd->row[0].rp, asize, hgcd->row[1].rp, rsize); 1295 MPN_NORMALIZE (rp, rsize); 1296 hgcd->row[2].rsize = rsize; 1297 1298 if (qp[qsize - 1] == 0) 1299 qsize--; 1300 1301 if (qsize == 1 && qp[0] == 1) 1302 qsize = 0; 1303 1304 quotients->size[quotients->size_next++] = qsize; 1305 quotients->limb_next += qsize; 1306 1307 ASSERT_QSTACK (quotients); 1308 1309 /* Update u and v */ 1310 ASSERT (hgcd->size + qsize <= hgcd->alloc); 1311 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, qp, qsize); 1312 ASSERT (hgcd->size < hgcd->alloc); 1313 1314 if (hgcd->row[2].rsize <= M) 1315 return hgcd_small_1 (hgcd, M, quotients); 1316 else 1317 { 1318 /* Keep this remainder */ 1319 hgcd->sign = ~hgcd->sign; 1320 1321 HGCD_SWAP4_LEFT (hgcd->row); 1322 return -1; 1323 } 1324 } 1325 1326 /* Called when values have been computed in r[0] and r[1], and the 1327 latter value is too large, and we know that it's not much too 1328 large. Returns the updated size for the uv matrix. */ 1329 static mp_size_t 1330 hgcd_adjust (struct hgcd_row *r, mp_size_t size, 1331 struct qstack *quotients) 1332 { 1333 mp_limb_t c0; 1334 mp_limb_t c1; 1335 mp_limb_t d; 1336 1337 /* Compute the correct r1. We have r1' = r1 - d r0, and we always 1338 have d = 1 or 2. */ 1339 1340 ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); 1341 1342 MPN_NORMALIZE (r[1].rp, r[1].rsize); 1343 1344 if (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)) 1345 { 1346 c0 = mpn_add_n (r[1].uvp[0], r[1].uvp[0], r[0].uvp[0], size); 1347 c1 = mpn_add_n (r[1].uvp[1], r[1].uvp[1], r[0].uvp[1], size); 1348 d = 1; 1349 } 1350 else 1351 { 1352 ASSERT_NOCARRY (mpn_sub (r[1].rp, r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); 1353 MPN_NORMALIZE (r[1].rp, r[1].rsize); 1354 ASSERT (MPN_LESS_P (r[1].rp, r[1].rsize, r[0].rp, r[0].rsize)); 1355 1356 c0 = mpn_addmul_1 (r[1].uvp[0], r[0].uvp[0], size, 2); 1357 c1 = mpn_addmul_1 (r[1].uvp[1], r[0].uvp[1], size, 2); 1358 d = 2; 1359 } 1360 1361 /* FIXME: Can avoid branches */ 1362 if (c1 != 0) 1363 { 1364 r[1].uvp[0][size] = c0; 1365 r[1].uvp[1][size] = c1; 1366 size++; 1367 } 1368 else 1369 { 1370 ASSERT (c0 == 0); 1371 } 1372 1373 /* Remains to adjust the quotient on stack */ 1374 qstack_adjust (quotients, d); 1375 1376 return size; 1377 } 1378 1379 /* Reduce using Lehmer steps. Called by mpn_hgcd when r1 has been 1380 reduced to approximately the right size. Also used by 1381 mpn_hgcd_lehmer. 
*/ 1382 static int 1383 hgcd_final (struct hgcd *hgcd, mp_size_t M, 1384 struct qstack *quotients) 1385 { 1386 ASSERT (hgcd->row[0].rsize > M); 1387 ASSERT (hgcd->row[1].rsize > M); 1388 1389 /* Can be equal when called by hgcd_lehmer. */ 1390 ASSERT (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, 1391 hgcd->row[0].rp, hgcd->row[0].rsize)); 1392 1393 for (;;) 1394 { 1395 mp_size_t L = hgcd->row[0].rsize; 1396 1397 struct hgcd2 R; 1398 int res; 1399 1400 if (L <= M + 2 1401 && (L < M + 2 || (hgcd->row[0].rp[M+1] & GMP_NUMB_HIGHBIT) == 0)) 1402 break; 1403 1404 res = mpn_hgcd2_lehmer_step (&R, 1405 hgcd->row[0].rp, hgcd->row[0].rsize, 1406 hgcd->row[1].rp, hgcd->row[1].rsize, 1407 quotients); 1408 1409 if (res == 0) 1410 { 1411 /* We must divide to make progress */ 1412 res = euclid_step (hgcd, M, quotients); 1413 1414 if (res >= 0) 1415 return res; 1416 } 1417 else if (res == 1) 1418 { 1419 mp_size_t qsize; 1420 1421 /* The quotient that has been computed for r2 is at most 2 1422 off. So adjust that, and avoid a full division. */ 1423 qstack_drop (quotients); 1424 1425 /* Top two rows of R must be the identity matrix, followed 1426 by a row (1, q). */ 1427 ASSERT (R.row[0].u == 1 && R.row[0].v == 0); 1428 ASSERT (R.row[1].u == 0 && R.row[1].v == 1); 1429 ASSERT (R.row[2].u == 1); 1430 1431 qsize = (R.row[2].v != 0); 1432 1433 hgcd_update_r (hgcd->row, &R.row[2].v, qsize); 1434 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, 1435 &R.row[2].v, qsize); 1436 ASSERT (hgcd->size < hgcd->alloc); 1437 1438 if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, 1439 hgcd->row[2].rp, hgcd->row[2].rsize)) 1440 hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); 1441 1442 ASSERT (hgcd->size < hgcd->alloc); 1443 1444 hgcd->sign = ~hgcd->sign; 1445 HGCD_SWAP4_LEFT (hgcd->row); 1446 } 1447 else 1448 { 1449 const struct hgcd2_row *s = R.row + (res - 2); 1450 int sign = R.sign; 1451 /* Max size after reduction, plus one */ 1452 mp_size_t ralloc = hgcd->row[1].rsize + 1; 1453 1454 if (res == 2) 1455 { 1456 qstack_drop (quotients); 1457 qstack_drop (quotients); 1458 } 1459 else if (res == 3) 1460 { 1461 sign = ~sign; 1462 qstack_drop (quotients); 1463 } 1464 1465 /* s[0] and s[1] correct. 
*/ 1466 hgcd->row[2].rsize 1467 = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, 1468 sign, 1469 s[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, 1470 s[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); 1471 1472 hgcd->row[3].rsize 1473 = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, 1474 ~sign, 1475 s[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, 1476 s[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); 1477 1478 hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, 1479 s, hgcd->row, hgcd->size); 1480 hgcd->sign ^= sign; 1481 1482 ASSERT (hgcd->row[2].rsize > M); 1483 1484 #if WANT_ASSERT 1485 switch (res) 1486 { 1487 default: 1488 ASSERT_ALWAYS (0 == "Unexpected value of res"); 1489 break; 1490 case 2: 1491 ASSERT (hgcd->row[2].rsize >= L - 1); 1492 ASSERT (hgcd->row[3].rsize >= L - 2); 1493 ASSERT (hgcd->row[2].rsize > M + 1); 1494 ASSERT (hgcd->row[3].rsize > M); 1495 break; 1496 case 3: 1497 ASSERT (hgcd->row[2].rsize >= L - 2); 1498 ASSERT (hgcd->row[3].rsize >= L - 2); 1499 ASSERT (hgcd->row[3].rsize > M); 1500 break; 1501 case 4: 1502 ASSERT (hgcd->row[2].rsize >= L - 2); 1503 ASSERT (hgcd->row[3].rsize < L || hgcd->row[3].rp[L-1] == 1); 1504 break; 1505 } 1506 #endif 1507 if (hgcd->row[3].rsize <= M) 1508 { 1509 /* Can happen only in the res == 4 case */ 1510 ASSERT (res == 4); 1511 1512 /* Backup two steps */ 1513 ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); 1514 1515 return hgcd_small_2 (hgcd, M, quotients); 1516 } 1517 1518 HGCD_SWAP4_2 (hgcd->row); 1519 } 1520 } 1521 1522 ASSERT (hgcd->row[1].rsize > M); 1523 1524 for (;;) 1525 { 1526 mp_size_t L = hgcd->row[0].rsize; 1527 mp_size_t ralloc; 1528 1529 mp_size_t qsize; 1530 mp_srcptr qp; 1531 1532 struct hgcd2 R; 1533 int res; 1534 1535 /* We don't want hgcd2 to pickup any bits below r0p[M-1], so 1536 don't tell mpn_hgcd2_lehmer_step about them. */ 1537 res = mpn_hgcd2_lehmer_step (&R, 1538 hgcd->row[0].rp+M-1, hgcd->row[0].rsize-M+1, 1539 hgcd->row[1].rp+M-1, hgcd->row[1].rsize-M+1, 1540 quotients); 1541 if (res == 0) 1542 { 1543 /* We must divide to make progress */ 1544 res = euclid_step (hgcd, M, quotients); 1545 1546 if (res >= 0) 1547 return res; 1548 1549 continue; 1550 } 1551 1552 if (res == 1) 1553 { 1554 mp_size_t qsize; 1555 1556 /* The quotient that has been computed for r2 is at most 2 1557 off. So adjust that, and avoid a full division. */ 1558 qstack_drop (quotients); 1559 1560 /* Top two rows of R must be the identity matrix, followed 1561 by a row (1, q). */ 1562 ASSERT (R.row[0].u == 1 && R.row[0].v == 0); 1563 ASSERT (R.row[1].u == 0 && R.row[1].v == 1); 1564 ASSERT (R.row[2].u == 1); 1565 1566 qsize = (R.row[2].v != 0); 1567 1568 hgcd_update_r (hgcd->row, &R.row[2].v, qsize); 1569 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, 1570 &R.row[2].v, qsize); 1571 ASSERT (hgcd->size < hgcd->alloc); 1572 1573 if (MPN_LEQ_P (hgcd->row[1].rp, hgcd->row[1].rsize, 1574 hgcd->row[2].rp, hgcd->row[2].rsize)) 1575 hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); 1576 1577 ASSERT (hgcd->size < hgcd->alloc); 1578 1579 hgcd->sign = ~hgcd->sign; 1580 HGCD_SWAP4_LEFT (hgcd->row); 1581 1582 continue; 1583 } 1584 1585 /* Now r0 and r1 are always correct. 
*/ 1586 /* Store new values in rows 2 and 3, to avoid overlap */ 1587 1588 /* Max size after reduction, plus one */ 1589 ralloc = hgcd->row[1].rsize + 1; 1590 1591 hgcd->row[2].rsize 1592 = mpn_hgcd2_fix (hgcd->row[2].rp, ralloc, 1593 R.sign, 1594 R.row[0].u, hgcd->row[0].rp, hgcd->row[0].rsize, 1595 R.row[0].v, hgcd->row[1].rp, hgcd->row[1].rsize); 1596 1597 hgcd->row[3].rsize 1598 = mpn_hgcd2_fix (hgcd->row[3].rp, ralloc, 1599 ~R.sign, 1600 R.row[1].u, hgcd->row[0].rp, hgcd->row[0].rsize, 1601 R.row[1].v, hgcd->row[1].rp, hgcd->row[1].rsize); 1602 1603 ASSERT (hgcd->row[2].rsize >= L - 1); 1604 ASSERT (hgcd->row[3].rsize >= L - 2); 1605 1606 ASSERT (hgcd->row[2].rsize > M); 1607 ASSERT (hgcd->row[3].rsize > M-1); 1608 1609 hgcd->size = hgcd2_mul (hgcd->row + 2, hgcd->alloc, 1610 R.row, hgcd->row, hgcd->size); 1611 hgcd->sign ^= R.sign; 1612 1613 if (hgcd->row[3].rsize <= M) 1614 { 1615 /* Backup two steps */ 1616 1617 /* We don't use R.row[2] and R.row[3], so drop the 1618 corresponding quotients. */ 1619 qstack_drop (quotients); 1620 qstack_drop (quotients); 1621 1622 return hgcd_small_2 (hgcd, M, quotients); 1623 } 1624 1625 HGCD_SWAP4_2 (hgcd->row); 1626 1627 if (res == 2) 1628 { 1629 qstack_drop (quotients); 1630 qstack_drop (quotients); 1631 1632 continue; 1633 } 1634 1635 /* We already know the correct q for computing r2 */ 1636 1637 qsize = qstack_get_1 (quotients, &qp); 1638 ASSERT (qsize < 2); 1639 1640 ASSERT (qsize + hgcd->size <= hgcd->alloc); 1641 hgcd_update_r (hgcd->row, qp, qsize); 1642 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, 1643 qp, qsize); 1644 ASSERT (hgcd->size < hgcd->alloc); 1645 1646 ASSERT (hgcd->row[2].rsize >= M - 2); 1647 1648 if (hgcd->row[2].rsize <= M) 1649 { 1650 /* Discard r3 */ 1651 qstack_drop (quotients); 1652 return hgcd_small_1 (hgcd, M, quotients); 1653 } 1654 if (res == 3) 1655 { 1656 /* Drop quotient for r3 */ 1657 qstack_drop (quotients); 1658 1659 hgcd->sign = ~hgcd->sign; 1660 HGCD_SWAP4_LEFT (hgcd->row); 1661 1662 continue; 1663 } 1664 1665 ASSERT (res == 4); 1666 ASSERT (hgcd->row[2].rsize > M); 1667 1668 /* We already know the correct q for computing r3 */ 1669 qsize = qstack_get_0 (quotients, &qp); 1670 ASSERT (qsize < 2); 1671 1672 ASSERT (qsize + hgcd->size <= hgcd->alloc); 1673 hgcd_update_r (hgcd->row + 1, qp, qsize); 1674 hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, 1675 qp, qsize); 1676 ASSERT (hgcd->size < hgcd->alloc); 1677 1678 ASSERT (hgcd->row[3].rsize <= M + 1); 1679 /* Appearantly not true. Probably because we have leading zeros 1680 when we call hgcd2. */ 1681 /* ASSERT (hgcd->row[3].rsize <= M || hgcd->row[3].rp[M] == 1); */ 1682 1683 if (hgcd->row[3].rsize <= M) 1684 return hgcd_jebelean (hgcd, M); 1685 1686 HGCD_SWAP4_2 (hgcd->row); 1687 } 1688 } 1689 1690 mp_size_t 1691 mpn_hgcd_itch (mp_size_t asize) 1692 { 1693 /* Scratch space is needed for calling hgcd. We need space for the 1694 results of all recursive calls. In addition, we need space for 1695 calling hgcd_fix and hgcd_mul, for which N = asize limbs should 1696 be enough. */ 1697 1698 /* Limit on the recursion depth */ 1699 unsigned k = mpn_hgcd_max_recursion (asize); 1700 1701 return asize + mpn_hgcd_init_itch (asize + 6 * k) + 12 * k; 1702 } 1703 1704 /* Repeatedly divides A by B, until the remainder fits in M = 1705 ceil(asize / 2) limbs. Stores cofactors in HGCD, and pushes the 1706 quotients on STACK. 
On success, HGCD->row[0, 1, 2] correspond to 1707 remainders that are larger than M limbs, while HGCD->row[3] 1708 correspond to a remainder that fit in M limbs. 1709 1710 Return 0 on failure (if B or A mod B fits in M limbs), otherwise 1711 return one of 1 - 4 as specified for hgcd_jebelean. */ 1712 int 1713 mpn_hgcd (struct hgcd *hgcd, 1714 mp_srcptr ap, mp_size_t asize, 1715 mp_srcptr bp, mp_size_t bsize, 1716 struct qstack *quotients, 1717 mp_ptr tp, mp_size_t talloc) 1718 { 1719 mp_size_t N = asize; 1720 mp_size_t M = (N + 1)/2; 1721 mp_size_t n; 1722 mp_size_t m; 1723 1724 struct hgcd R; 1725 mp_size_t itch; 1726 1727 ASSERT (M); 1728 #if WANT_TRACE 1729 trace ("hgcd: asize = %d, bsize = %d, HGCD_SCHOENHAGE_THRESHOLD = %d\n", 1730 asize, bsize, HGCD_SCHOENHAGE_THRESHOLD); 1731 if (asize < 100) 1732 trace (" a = %Nd\n" 1733 " b = %Nd\n", ap, asize, bp, bsize); 1734 #endif 1735 1736 if (bsize <= M) 1737 return 0; 1738 1739 ASSERT (asize >= 2); 1740 1741 /* Initialize, we keep r0 and r1 as the reduced numbers (so far). */ 1742 hgcd_start (hgcd, ap, asize, bp, bsize); 1743 1744 if (BELOW_THRESHOLD (N, HGCD_SCHOENHAGE_THRESHOLD)) 1745 return hgcd_final (hgcd, M, quotients); 1746 1747 /* Reduce the size to M + m + 1. Usually, only one hgcd call is 1748 needed, but we may need multiple calls. When finished, the values 1749 are stored in r0 (potentially large) and r1 (smaller size) */ 1750 1751 n = N - M; 1752 m = (n + 1)/2; 1753 1754 /* The second recursive call can use numbers of size up to n+3 */ 1755 itch = mpn_hgcd_init_itch (n+3); 1756 1757 ASSERT (itch <= talloc); 1758 mpn_hgcd_init (&R, n+3, tp); 1759 tp += itch; talloc -= itch; 1760 1761 while (hgcd->row[1].rsize > M + m + 1) 1762 { 1763 /* Max size after reduction, plus one */ 1764 mp_size_t ralloc = hgcd->row[1].rsize + 1; 1765 1766 int res = mpn_hgcd (&R, 1767 hgcd->row[0].rp + M, hgcd->row[0].rsize - M, 1768 hgcd->row[1].rp + M, hgcd->row[1].rsize - M, 1769 quotients, tp, talloc); 1770 1771 if (res == 0) 1772 { 1773 /* We must divide to make progress */ 1774 res = euclid_step (hgcd, M, quotients); 1775 1776 if (res > 0) 1777 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); 1778 if (res >= 0) 1779 return res; 1780 1781 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); 1782 } 1783 else if (res <= 2) 1784 { 1785 /* The reason we use hgcd_adjust also when res == 2 is that 1786 either r2 is correct, and we get it for free. 1787 1788 Or r2 is too large. Then can correct it by a few bignum 1789 subtractions, and we are *guaranteed* that the result is 1790 small enough that we don't need another run through this 1791 loop. */ 1792 1793 /* FIXME: For res == 1, the newly computed row[2] will be 1794 the same as the old row[1], so we do some unnecessary 1795 computations. */ 1796 1797 qstack_drop (quotients); 1798 1799 /* Store new values in rows 2 and 3, to avoid overlap */ 1800 hgcd->row[2].rsize 1801 = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, 1802 ~R.sign, R.size, &R.row[1], 1803 hgcd->row[0].rp, hgcd->row[1].rp, 1804 tp, talloc); 1805 1806 hgcd->row[3].rsize 1807 = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, 1808 R.sign, R.size, &R.row[2], 1809 hgcd->row[0].rp, hgcd->row[1].rp, 1810 tp, talloc); 1811 1812 ASSERT (hgcd->row[2].rsize > M); 1813 ASSERT (hgcd->row[3].rsize > M); 1814 1815 /* Computes the uv matrix for the (possibly incorrect) 1816 values r1, r2. The elements must be smaller than the 1817 correct ones, since they correspond to a too small q. 
*/ 1818 1819 hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, 1820 R.row + 1, R.size, 1821 hgcd->row, hgcd->size, 1822 tp, talloc); 1823 hgcd->sign ^= ~R.sign; 1824 1825 if (MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, 1826 hgcd->row[2].rp, hgcd->row[2].rsize)) 1827 { 1828 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); 1829 1830 HGCD_SWAP4_2 (hgcd->row); 1831 } 1832 else 1833 { 1834 /* r2 was too large, i.e. q0 too small. In this case we 1835 must have r2 % r1 <= r2 - r1 smaller than M + m + 1. */ 1836 1837 hgcd->size = hgcd_adjust (hgcd->row + 2, hgcd->size, quotients); 1838 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); 1839 1840 ASSERT (hgcd->row[3].rsize <= M + m + 1); 1841 1842 if (hgcd->row[3].rsize <= M) 1843 { 1844 /* Backup two steps */ 1845 ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); 1846 1847 return hgcd_small_2 (hgcd, M, quotients); 1848 } 1849 1850 HGCD_SWAP4_2 (hgcd->row); 1851 1852 /* Loop always terminates here. */ 1853 break; 1854 } 1855 } 1856 else if (res == 3) 1857 { 1858 qstack_drop(quotients); 1859 1860 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); 1861 1862 /* Store new values in rows 2 and 3, to avoid overlap */ 1863 hgcd->row[2].rsize 1864 = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, 1865 ~R.sign, R.size, &R.row[1], 1866 hgcd->row[0].rp, hgcd->row[1].rp, 1867 tp, talloc); 1868 1869 hgcd->row[3].rsize 1870 = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, 1871 R.sign, R.size, &R.row[2], 1872 hgcd->row[0].rp, hgcd->row[1].rp, 1873 tp, talloc); 1874 1875 ASSERT (hgcd->row[2].rsize > M); 1876 ASSERT (hgcd->row[3].rsize > M); 1877 1878 hgcd->size = hgcd_mul (hgcd->row + 2, hgcd->alloc, 1879 R.row + 1, R.size, 1880 hgcd->row, hgcd->size, 1881 tp, talloc); 1882 hgcd->sign ^= ~R.sign; 1883 1884 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); 1885 1886 HGCD_SWAP4_2 (hgcd->row); 1887 } 1888 else 1889 { 1890 ASSERT (res == 4); 1891 1892 /* All of r0, r1, r3 and r3 are correct. 1893 Compute r2 and r3 */ 1894 1895 ASSERT_HGCD (&R, 1896 hgcd->row[0].rp + M, hgcd->row[0].rsize - M, 1897 hgcd->row[1].rp + M, hgcd->row[1].rsize - M, 1898 0, 4); 1899 1900 /* Store new values in rows 2 and 3, to avoid overlap */ 1901 hgcd->row[2].rsize 1902 = mpn_hgcd_fix (M, hgcd->row[2].rp, ralloc, 1903 R.sign, R.size, &R.row[2], 1904 hgcd->row[0].rp, hgcd->row[1].rp, 1905 tp, talloc); 1906 1907 hgcd->row[3].rsize 1908 = mpn_hgcd_fix (M, hgcd->row[3].rp, ralloc, 1909 ~R.sign, R.size, &R.row[3], 1910 hgcd->row[0].rp, hgcd->row[1].rp, 1911 tp, talloc); 1912 1913 ASSERT (hgcd->row[2].rsize > M); 1914 ASSERT (hgcd->row[3].rsize <= M + m + 1); 1915 1916 hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, 1917 R.row+2, R.size, 1918 hgcd->row, hgcd->size, 1919 tp, talloc); 1920 hgcd->sign ^= R.sign; 1921 1922 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); 1923 1924 if (hgcd->row[3].rsize <= M) 1925 { 1926 /* Backup two steps */ 1927 /* Both steps must always be possible, but it's not 1928 trivial to ASSERT that here. */ 1929 ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); 1930 1931 return hgcd_small_2 (hgcd, M, quotients); 1932 } 1933 HGCD_SWAP4_2 (hgcd->row); 1934 1935 /* Always exit the loop. */ 1936 break; 1937 } 1938 } 1939 1940 ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); 1941 ASSERT (hgcd->row[1].rsize > M); 1942 ASSERT (hgcd->row[1].rsize <= M + m + 1); 1943 1944 if (hgcd->row[0].rsize > M + m + 1) 1945 { 1946 /* One euclid step to reduce size. 
*/ 1947 int res = euclid_step (hgcd, M, quotients); 1948 1949 if (res > 0) 1950 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); 1951 if (res >= 0) 1952 return res; 1953 1954 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); 1955 } 1956 1957 ASSERT (hgcd->row[0].rsize >= hgcd->row[1].rsize); 1958 ASSERT (hgcd->row[0].rsize <= M + m + 1); 1959 ASSERT (hgcd->row[1].rsize > M); 1960 1961 /* Second phase, reduce size until we have one number of size > M 1962 and one of size <= M+1 */ 1963 while (hgcd->row[1].rsize > M + 1) 1964 { 1965 mp_size_t k = 2*M - hgcd->row[0].rsize; 1966 mp_size_t n1 = hgcd->row[0].rsize - k; 1967 mp_size_t qsize; 1968 mp_srcptr qp; 1969 int res; 1970 1971 ASSERT (k + (n1 + 1)/2 == M); 1972 ASSERT (n1 >= 2); 1973 1974 ASSERT (n1 <= 2*(m + 1)); 1975 ASSERT (n1 <= n + 3); 1976 1977 res = mpn_hgcd (&R, 1978 hgcd->row[0].rp + k, hgcd->row[0].rsize - k, 1979 hgcd->row[1].rp + k, hgcd->row[1].rsize - k, 1980 quotients, tp, talloc); 1981 1982 if (res == 0) 1983 { 1984 /* The first remainder was small. Then there's a good chance 1985 that the remainder A % B is also small. */ 1986 res = euclid_step (hgcd, M, quotients); 1987 1988 if (res > 0) 1989 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); 1990 if (res >= 0) 1991 return res; 1992 1993 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); 1994 continue; 1995 } 1996 1997 if (res == 1) 1998 { 1999 mp_srcptr qp; 2000 mp_size_t qsize; 2001 2002 qstack_drop (quotients); 2003 2004 /* Compute possibly incorrect r2 and corresponding u2, v2. 2005 Incorrect matrix elements must be smaller than the 2006 correct ones, since they correspond to a too small q. */ 2007 qsize = qstack_get_0 (quotients, &qp); 2008 2009 ASSERT (qsize + hgcd->size <= hgcd->alloc); 2010 hgcd_update_r (hgcd->row, qp, qsize); 2011 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, 2012 qp, qsize); 2013 ASSERT (hgcd->size < hgcd->alloc); 2014 2015 if (!MPN_LESS_P (hgcd->row[3].rp, hgcd->row[3].rsize, 2016 hgcd->row[2].rp, hgcd->row[2].rsize)) 2017 hgcd->size = hgcd_adjust (hgcd->row + 1, hgcd->size, quotients); 2018 2019 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 3); 2020 2021 if (hgcd->row[2].rsize <= M) 2022 { 2023 /* Backup one steps */ 2024 ASSERT (!hgcd_start_row_p (hgcd->row + 2, hgcd->size)); 2025 2026 return hgcd_small_1 (hgcd, M, quotients); 2027 } 2028 2029 HGCD_SWAP4_LEFT (hgcd->row); 2030 hgcd->sign = ~hgcd->sign; 2031 continue; 2032 } 2033 2034 /* Now r0 and r1 are always correct. */ 2035 2036 /* It's possible that first two "new" r:s are the same as the 2037 old ones. In that case skip recomputing them. */ 2038 2039 if (!hgcd_start_row_p (&R.row[0], R.size)) 2040 { 2041 /* Store new values in rows 2 and 3, to avoid overlap */ 2042 hgcd->row[2].rsize 2043 = mpn_hgcd_fix (k, hgcd->row[2].rp, hgcd->row[0].rsize + 1, 2044 R.sign, R.size, &R.row[0], 2045 hgcd->row[0].rp, hgcd->row[1].rp, 2046 tp, talloc); 2047 2048 hgcd->row[3].rsize 2049 = mpn_hgcd_fix (k, hgcd->row[3].rp, hgcd->row[1].rsize + 1, 2050 ~R.sign, R.size, &R.row[1], 2051 hgcd->row[0].rp, hgcd->row[1].rp, 2052 tp, talloc); 2053 2054 ASSERT (hgcd->row[2].rsize > M); 2055 ASSERT (hgcd->row[3].rsize > k); 2056 2057 hgcd->size = hgcd_mul (hgcd->row+2, hgcd->alloc, 2058 R.row, R.size, hgcd->row, hgcd->size, 2059 tp, talloc); 2060 hgcd->sign ^= R.sign; 2061 2062 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 2, 4); 2063 2064 if (hgcd->row[3].rsize <= M) 2065 { 2066 /* Backup two steps */ 2067 2068 /* We don't use R.row[2] and R.row[3], so drop the 2069 corresponding quotients. 
*/ 2070 qstack_drop (quotients); 2071 qstack_drop (quotients); 2072 2073 return hgcd_small_2 (hgcd, M, quotients); 2074 } 2075 2076 HGCD_SWAP4_2 (hgcd->row); 2077 2078 if (res == 2) 2079 { 2080 qstack_drop (quotients); 2081 qstack_drop (quotients); 2082 2083 continue; 2084 } 2085 } 2086 2087 ASSERT (res >= 3); 2088 2089 /* We already know the correct q */ 2090 qsize = qstack_get_1 (quotients, &qp); 2091 2092 ASSERT (qsize + hgcd->size <= hgcd->alloc); 2093 hgcd_update_r (hgcd->row, qp, qsize); 2094 hgcd->size = hgcd_update_uv (hgcd->row, hgcd->size, 2095 qp, qsize); 2096 ASSERT (hgcd->size < hgcd->alloc); 2097 2098 ASSERT (hgcd->row[2].rsize > k); 2099 if (hgcd->row[2].rsize <= M) 2100 { 2101 /* Discard r3 */ 2102 qstack_drop (quotients); 2103 return hgcd_small_1 (hgcd, M, quotients); 2104 } 2105 if (res == 3) 2106 { 2107 /* Drop quotient for r3 */ 2108 qstack_drop (quotients); 2109 hgcd->sign = ~hgcd->sign; 2110 HGCD_SWAP4_LEFT (hgcd->row); 2111 2112 continue; 2113 } 2114 2115 ASSERT (hgcd->row[2].rsize > M); 2116 ASSERT (res == 4); 2117 2118 /* We already know the correct q */ 2119 qsize = qstack_get_0 (quotients, &qp); 2120 2121 ASSERT (qsize + hgcd->size <= hgcd->alloc); 2122 hgcd_update_r (hgcd->row + 1, qp, qsize); 2123 hgcd->size = hgcd_update_uv (hgcd->row + 1, hgcd->size, 2124 qp, qsize); 2125 ASSERT (hgcd->size < hgcd->alloc); 2126 ASSERT (hgcd->row[3].rsize <= M + 1); 2127 2128 if (hgcd->row[3].rsize <= M) 2129 { 2130 #if WANT_ASSERT 2131 qstack_rotate (quotients, 0); 2132 #endif 2133 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 4); 2134 return hgcd_jebelean (hgcd, M); 2135 } 2136 2137 HGCD_SWAP4_2 (hgcd->row); 2138 } 2139 2140 ASSERT_HGCD (hgcd, ap, asize, bp, bsize, 0, 2); 2141 2142 return hgcd_final (hgcd, M, quotients); 2143 } -
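The reduction loop above relies on a property of the Euclidean quotient sequence: the recursive mpn_hgcd call looks only at the high limbs (row[0].rp + M, row[1].rp + M), so the quotients it computes come from truncated operands, and the last of them can be too small; the res == 0 and hgcd_adjust branches exist to repair exactly that case. The sketch below is a standalone word-size illustration of that property, not part of the patch; it is plain ISO C and every name in it is invented for the example. It prints the quotient sequence of a pair (a, b) next to the sequence obtained from their top 32 bits only, so the initial agreement and the eventual drift are visible.

/* Standalone illustration (not part of the patch): the first quotients of
 * (a, b) usually coincide with those of their leading parts, which is what
 * lets mpn_hgcd recurse on the high M limbs and only occasionally repair
 * the result with euclid_step or hgcd_adjust. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Store the first n quotients of the remainder sequence of (a, b). */
static int
quotients (uint64_t a, uint64_t b, uint64_t *q, int n)
{
  int k = 0;
  while (b != 0 && k < n)
    {
      q[k++] = a / b;
      uint64_t r = a % b;
      a = b;
      b = r;
    }
  return k;
}

int
main (void)
{
  uint64_t a = UINT64_C (0xfedcba9876543210);
  uint64_t b = UINT64_C (0x0f1e2d3c4b5a6978);
  uint64_t qf[8], qh[8];
  int nf = quotients (a, b, qf, 8);
  int nh = quotients (a >> 32, b >> 32, qh, 8);   /* top halves only */
  int n = nf < nh ? nf : nh;

  for (int i = 0; i < n; i++)
    printf ("q[%d]: full = %" PRIu64 ", high parts = %" PRIu64 "%s\n",
            i, qf[i], qh[i], qf[i] == qh[i] ? "" : "   <-- mismatch");
  return 0;
}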
mpn/generic/hgcd2.c
diff -N -u -r gmp-4.2.1/mpn/generic/hgcd2.c gmp-4.2.1-patched/mpn/generic/hgcd2.c
old new 1 /* hgcd2.c 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, 8 Inc. 9 10 This file is part of the GNU MP Library. 11 12 The GNU MP Library is free software; you can redistribute it and/or modify 13 it under the terms of the GNU General Public License as published by 14 the Free Software Foundation; either version 2.1 of the License, or (at your 15 option) any later version. 16 17 The GNU MP Library is distributed in the hope that it will be useful, but 18 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 19 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 20 License for more details. 21 22 You should have received a copy of the GNU General Public License 23 along with the GNU MP Library; see the file COPYING. If not, write to 24 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, 25 MA 02111-1307, USA. */ 26 27 #include "gmp.h" 28 #include "gmp-impl.h" 29 #include "longlong.h" 30 31 #if GMP_NAIL_BITS == 0 32 33 /* Copied from mpn/generic/gcdext.c, and modified slightly to return 34 the remainder. */ 35 /* Two-limb division optimized for small quotients. */ 36 static inline mp_limb_t 37 div2 (mp_ptr rp, 38 mp_limb_t nh, mp_limb_t nl, 39 mp_limb_t dh, mp_limb_t dl) 40 { 41 mp_limb_t q = 0; 42 43 if ((mp_limb_signed_t) nh < 0) 44 { 45 int cnt; 46 for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++) 47 { 48 dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); 49 dl = dl << 1; 50 } 51 52 while (cnt) 53 { 54 q <<= 1; 55 if (nh > dh || (nh == dh && nl >= dl)) 56 { 57 sub_ddmmss (nh, nl, nh, nl, dh, dl); 58 q |= 1; 59 } 60 dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); 61 dh = dh >> 1; 62 cnt--; 63 } 64 } 65 else 66 { 67 int cnt; 68 for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) 69 { 70 dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); 71 dl = dl << 1; 72 } 73 74 while (cnt) 75 { 76 dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); 77 dh = dh >> 1; 78 q <<= 1; 79 if (nh > dh || (nh == dh && nl >= dl)) 80 { 81 sub_ddmmss (nh, nl, nh, nl, dh, dl); 82 q |= 1; 83 } 84 cnt--; 85 } 86 } 87 88 rp[0] = nl; 89 rp[1] = nh; 90 91 return q; 92 } 93 #else /* GMP_NAIL_BITS != 0 */ 94 /* Two-limb division optimized for small quotients. Input words 95 include nails, which must be zero. */ 96 static inline mp_limb_t 97 div2 (mp_ptr rp, 98 mp_limb_t nh, mp_limb_t nl, 99 mp_limb_t dh, mp_limb_t dl) 100 { 101 mp_limb_t q = 0; 102 int cnt; 103 104 ASSERT_LIMB(nh); 105 ASSERT_LIMB(nl); 106 ASSERT_LIMB(dh); 107 ASSERT_LIMB(dl); 108 109 /* FIXME: Always called with nh > 0 and dh >0. Then it should be 110 enough to look at the high limbs to select cnt. */ 111 for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) 112 { 113 dh = (dh << 1) | (dl >> (GMP_NUMB_BITS - 1)); 114 dl = (dl << 1) & GMP_NUMB_MASK; 115 } 116 117 while (cnt) 118 { 119 dl = (dh << (GMP_NUMB_BITS - 1)) | (dl >> 1); 120 dh = dh >> 1; 121 dl &= GMP_NUMB_MASK; 122 123 q <<= 1; 124 if (nh > dh || (nh == dh && nl >= dl)) 125 { 126 /* FIXME: We could perhaps optimize this by unrolling the 127 loop 2^GMP_NUMB_BITS - 1 times? 
*/ 128 nl -= dl; 129 nh -= dh; 130 nh -= (nl >> (GMP_LIMB_BITS - 1)); 131 nl &= GMP_NUMB_MASK; 132 133 q |= 1; 134 } 135 cnt--; 136 } 137 ASSERT (nh < dh || (nh == dh && nl < dl)); 138 rp[0] = nl; 139 rp[1] = nh; 140 141 return q; 142 } 143 #endif /* GMP_NAIL_BITS != 0 */ 144 145 #define SUB_2(w1,w0, x1,x0, y1,y0) \ 146 do { \ 147 ASSERT_LIMB (x1); \ 148 ASSERT_LIMB (x0); \ 149 ASSERT_LIMB (y1); \ 150 ASSERT_LIMB (y0); \ 151 \ 152 if (GMP_NAIL_BITS == 0) \ 153 sub_ddmmss (w1,w0, x1,x0, y1,y0); \ 154 else \ 155 { \ 156 mp_limb_t __w0, __c; \ 157 SUBC_LIMB (__c, __w0, x0, y0); \ 158 (w1) = ((x1) - (y1) - __c) & GMP_NUMB_MASK; \ 159 (w0) = __w0; \ 160 } \ 161 } while (0) 162 163 static inline void 164 qstack_push_0 (struct qstack *stack) 165 { 166 ASSERT_QSTACK (stack); 167 168 if (stack->size_next >= QSTACK_MAX_QUOTIENTS) 169 qstack_rotate (stack, 0); 170 171 stack->size[stack->size_next++] = 0; 172 } 173 174 static inline void 175 qstack_push_1 (struct qstack *stack, mp_limb_t q) 176 { 177 ASSERT (q >= 2); 178 179 ASSERT_QSTACK (stack); 180 181 if (stack->limb_next >= stack->limb_alloc) 182 qstack_rotate (stack, 1); 183 184 else if (stack->size_next >= QSTACK_MAX_QUOTIENTS) 185 qstack_rotate (stack, 0); 186 187 stack->size[stack->size_next++] = 1; 188 stack->limb[stack->limb_next++] = q; 189 190 ASSERT_QSTACK (stack); 191 } 192 193 /* Produce r_k from r_i and r_j, and push the corresponding 194 quotient. */ 195 #if __GMP_HAVE_TOKEN_PASTE 196 #define HGCD2_STEP(i, j, k) do { \ 197 SUB_2 (rh ## k, rl ## k, \ 198 rh ## i, rl ## i, \ 199 rh ## j, rl ## j); \ 200 \ 201 /* Could check here for the special case rh3 == 0, \ 202 but it's covered by the below condition as well */ \ 203 if ( rh ## k < rh ## j \ 204 || ( rh ## k == rh ## j \ 205 && rl ## k < rl ## j)) \ 206 { \ 207 /* Unit quotient */ \ 208 u ## k = u ## i + u ## j; \ 209 v ## k = v ## i + v ## j; \ 210 \ 211 if (quotients) \ 212 qstack_push_0 (quotients); \ 213 } \ 214 else \ 215 { \ 216 mp_limb_t r[2]; \ 217 mp_limb_t q = 1 + div2 (r, rh ## k, rl ## k, \ 218 rh ## j, rl ## j); \ 219 rl ## k = r[0]; rh ## k = r[1]; \ 220 u ## k = u ## i + q * u ## j; \ 221 v ## k = v ## i + q * v ## j; \ 222 \ 223 if (quotients) \ 224 qstack_push_1 (quotients, q); \ 225 } \ 226 } while (0) 227 #else /* ! __GMP_HAVE_TOKEN_PASTE */ 228 #define HGCD2_STEP(i, j, k) do { \ 229 SUB_2 (rh/**/k, rl/**/k, \ 230 rh/**/i, rl/**/i, \ 231 rh/**/j, rl/**/j); \ 232 \ 233 /* Could check here for the special case rh3 == 0, \ 234 but it's covered by the below condition as well */ \ 235 if ( rh/**/k < rh/**/j \ 236 || ( rh/**/k == rh/**/j \ 237 && rl/**/k < rl/**/j)) \ 238 { \ 239 /* Unit quotient */ \ 240 u/**/k = u/**/i + u/**/j; \ 241 v/**/k = v/**/i + v/**/j; \ 242 \ 243 if (quotients) \ 244 qstack_push_0 (quotients); \ 245 } \ 246 else \ 247 { \ 248 mp_limb_t r[2]; \ 249 mp_limb_t q = 1 + div2 (r, rh/**/k, rl/**/k, \ 250 rh/**/j, rl/**/j); \ 251 rl/**/k = r[0]; rh/**/k = r[1]; \ 252 u/**/k = u/**/i + q * u/**/j; \ 253 v/**/k = v/**/i + q * v/**/j; \ 254 \ 255 if (quotients) \ 256 qstack_push_1 (quotients, q); \ 257 } \ 258 } while (0) 259 #endif /* ! __GMP_HAVE_TOKEN_PASTE */ 260 261 /* Repeatedly divides A by B, until the remainder is a single limb. 262 Stores cofactors in HGCD, and pushes the quotients on STACK (unless 263 STACK is NULL). On success, HGCD->row[0, 1, 2] correspond to 264 remainders that are larger than one limb, while HGCD->row[3] 265 correspond to a remainder that fit in a single limb. 
266 267 Return 0 on failure (if B or A mod B fits in a single limb). Return 268 1 if r0 and r1 are correct, but we still make no progress because 269 r0 = A, r1 = B. 270 271 Otherwise return 2, 3 or 4 depending on how many of the r:s that 272 satisfy Jebelean's criterion. */ 273 /* FIXME: There are two more micro optimizations that could be done to 274 this code: 275 276 The div2 function starts with checking the most significant bit of 277 the numerator. When we call div2, that bit is know in advance for 278 all but the one or two first calls, so we could split div2 in two 279 functions, and call the right one. 280 281 We could also have two versions of this code, with and without the 282 quotient argument, to avoid checking if it's NULL in the middle of 283 the loop. */ 284 285 int 286 mpn_hgcd2 (struct hgcd2 *hgcd, 287 mp_limb_t ah, mp_limb_t al, 288 mp_limb_t bh, mp_limb_t bl, 289 struct qstack *quotients) 290 { 291 /* For all divisions, we special case q = 1, which accounts for 292 approximately 41% of the quotients for random numbers (Knuth, 293 TAOCP 4.5.3) */ 294 295 /* Use scalar variables */ 296 mp_limb_t rh1, rl1, u1, v1; 297 mp_limb_t rh2, rl2, u2, v2; 298 mp_limb_t rh3, rl3, u3, v3; 299 300 ASSERT_LIMB(ah); 301 ASSERT_LIMB(al); 302 ASSERT_LIMB(bh); 303 ASSERT_LIMB(bl); 304 ASSERT (ah > bh || (ah == bh && al >= bl)); 305 306 if (bh == 0) 307 return 0; 308 309 { 310 mp_limb_t rh0, rl0, u0, v0; 311 312 /* Initialize first two rows */ 313 rh0 = ah; rl0 = al; u0 = 1; v0 = 0; 314 rh1 = bh; rl1 = bl; u1 = 0; v1 = 1; 315 316 SUB_2 (rh2, rl2, rh0, rl0, rh1, rl1); 317 318 if (rh2 == 0) 319 return 0; 320 321 if (rh2 < rh1 || (rh2 == rh1 && rl2 < rl1)) 322 { 323 /* Unit quotient */ 324 v2 = 1; 325 326 if (quotients) 327 qstack_push_0 (quotients); 328 } 329 else 330 { 331 mp_limb_t r[2]; 332 mp_limb_t q = 1 + div2 (r, rh2, rl2, rh1, rl1); 333 334 rl2 = r[0]; rh2 = r[1]; 335 336 if (rh2 == 0) 337 return 0; 338 339 v2 = q; 340 341 if (quotients) 342 qstack_push_1 (quotients, q); 343 } 344 345 u2 = 1; 346 347 /* The simple version of the loop is as follows: 348 | 349 | hgcd->sign = 0; 350 | for (;;) 351 | { 352 | (q, rh3, rl3]) = divmod (r1, r2); 353 | u[3] = u1 + q * u2; 354 | v[3] = v1 + q * v2; 355 | qstack_push_1 (quotients, q); 356 | 357 | if (rh3 == 0) 358 | break; 359 | 360 | HGCD2_SHIFT4_LEFT (hgcd->row); 361 | hgcd->sign = ~hgcd->sign; 362 | } 363 | 364 | But then we special case for q = 1, and unroll the loop four times 365 | to avoid data movement. 
*/ 366 367 for (;;) 368 { 369 HGCD2_STEP (1, 2, 3); 370 if (rh3 == 0) 371 { 372 hgcd->row[0].u = u0; hgcd->row[0].v = v0; 373 374 hgcd->sign = 0; 375 376 break; 377 } 378 HGCD2_STEP (2, 3, 0); 379 if (rh0 == 0) 380 { 381 hgcd->row[0].u = u1; hgcd->row[0].v = v1; 382 383 rh1 = rh2; rl1 = rl2; u1 = u2; v1 = v2; 384 rh2 = rh3; rl2 = rl3; u2 = u3; v2 = v3; 385 rh3 = rh0; rl3 = rl0; u3 = u0; v3 = v0; 386 387 hgcd->sign = -1; 388 break; 389 } 390 391 HGCD2_STEP (3, 0, 1); 392 if (rh1 == 0) 393 { 394 hgcd->row[0].u = u2; hgcd->row[0].v = v2; 395 rh2 = rh0; rl2 = rl0; u2 = u0; v2 = v0; 396 397 MP_LIMB_T_SWAP (rh1, rh3); MP_LIMB_T_SWAP (rl1, rl3); 398 MP_LIMB_T_SWAP ( u1, u3); MP_LIMB_T_SWAP ( v1, v3); 399 400 hgcd->sign = 0; 401 break; 402 } 403 404 HGCD2_STEP (0, 1, 2); 405 if (rh2 == 0) 406 { 407 hgcd->row[0].u = u3; hgcd->row[0].v = v3; 408 409 rh3 = rh2; rl3 = rl2; u3 = u2; v3 = v2; 410 rh2 = rh1; rl2 = rl1; u2 = u1; v2 = v1; 411 rh1 = rh0; rl1 = rl0; u1 = u0; v1 = v0; 412 413 hgcd->sign = -1; 414 break; 415 } 416 } 417 } 418 419 ASSERT (rh1 != 0); 420 ASSERT (rh2 != 0); 421 ASSERT (rh3 == 0); 422 ASSERT (rh1 > rh2 || (rh1 == rh2 && rl1 > rl2)); 423 ASSERT (rh2 > rh3 || (rh2 == rh3 && rl2 > rl3)); 424 425 /* Coefficients to be returned */ 426 hgcd->row[1].u = u1; hgcd->row[1].v = v1; 427 hgcd->row[2].u = u2; hgcd->row[2].v = v2; 428 hgcd->row[3].u = u3; hgcd->row[3].v = v3; 429 430 /* Rows 1, 2 and 3 are used below, rh0, rl0, u0 and v0 are not. */ 431 #if GMP_NAIL_BITS == 0 432 { 433 mp_limb_t sh; 434 mp_limb_t sl; 435 mp_limb_t th; 436 mp_limb_t tl; 437 438 /* Check r2 */ 439 /* We always have r2 > u2, v2 */ 440 441 if (hgcd->sign >= 0) 442 { 443 /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ 444 sl = u2 + u1; 445 sh = (sl < u1); 446 } 447 else 448 { 449 /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ 450 sl = v2 + v1; 451 sh = (sl < v1); 452 } 453 454 sub_ddmmss (th, tl, rh1, rl1, rh2, rl2); 455 456 if (th < sh || (th == sh && tl < sl)) 457 return 2 - (hgcd->row[0].v == 0); 458 459 /* Check r3 */ 460 461 if (hgcd->sign >= 0) 462 { 463 /* Check r3 >= max (-u3, -v3) = |u3| */ 464 if (rl3 < u3) 465 return 3; 466 467 /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ 468 sl = v3 + v2; 469 sh = (sl < v2); 470 } 471 else 472 { 473 /* Check r3 >= max (-u3, -v3) = |v3| */ 474 if (rl3 < v3) 475 return 3; 476 477 /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ 478 sl = u3 + u2; 479 sh = (sl < u2); 480 } 481 482 sub_ddmmss (th, tl, rh2, rl2, 0, rl3); 483 484 if (th < sh || (th == sh && tl < sl)) 485 return 3; 486 487 return 4; 488 } 489 #else /* GMP_NAIL_BITS > 0 */ 490 { 491 mp_limb_t sl; 492 mp_limb_t th; 493 mp_limb_t tl; 494 495 /* Check r2 */ 496 /* We always have r2 > u2, v2 */ 497 498 if (hgcd->sign >= 0) 499 { 500 /* Check if r1 - r2 >= u2 - u1 = |u2| + |u1| */ 501 sl = u2 + u1; 502 } 503 else 504 { 505 /* Check if r1 - r2 >= v2 - v1 = |v2| + |v1| */ 506 sl = v2 + v1; 507 } 508 509 tl = rl1 - rl2; 510 th = rh1 - rh2 - (tl >> (GMP_LIMB_BITS - 1)); 511 ASSERT_LIMB(th); 512 513 if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) 514 && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) 515 return 2 - (hgcd->row[0].v == 0); 516 517 /* Check r3 */ 518 519 if (hgcd->sign >= 0) 520 { 521 /* Check r3 >= max (-u3, -v3) = |u3| */ 522 if (rl3 < u3) 523 return 3; 524 525 /* Check r3 - r2 >= v3 - v2 = |v2| + |v1|*/ 526 sl = v3 + v2; 527 } 528 else 529 { 530 /* Check r3 >= max (-u3, -v3) = |v3| */ 531 if (rl3 < v3) 532 return 3; 533 534 /* Check r3 - r2 >= u3 - u2 = |u2| + |u1| */ 535 sl = u3 + u2; 536 } 537 538 tl = rl2 - 
rl3; 539 th = rh2 - (tl >> (GMP_LIMB_BITS - 1)); 540 ASSERT_LIMB(th); 541 542 if (th < (CNST_LIMB(1) << GMP_NAIL_BITS) 543 && ((th << GMP_NUMB_BITS) | (tl & GMP_NUMB_MASK)) < sl) 544 return 3; 545 546 return 4; 547 } 548 #endif /* GMP_NAIL_BITS > 0 */ 549 } 550 551 mp_size_t 552 mpn_hgcd2_fix (mp_ptr rp, mp_size_t ralloc, 553 int sign, 554 mp_limb_t u, mp_srcptr ap, mp_size_t asize, 555 mp_limb_t v, mp_srcptr bp, mp_size_t bsize) 556 { 557 mp_size_t rsize; 558 mp_limb_t cy; 559 560 ASSERT_LIMB(u); 561 ASSERT_LIMB(v); 562 563 if (sign < 0) 564 { 565 MP_LIMB_T_SWAP (u,v); 566 MPN_SRCPTR_SWAP (ap, asize, bp, bsize); 567 } 568 569 ASSERT (u > 0); 570 571 ASSERT (asize <= ralloc); 572 rsize = asize; 573 cy = mpn_mul_1 (rp, ap, asize, u); 574 if (cy) 575 { 576 ASSERT (rsize < ralloc); 577 rp[rsize++] = cy; 578 } 579 580 if (v > 0) 581 { 582 ASSERT (bsize <= rsize); 583 cy = mpn_submul_1 (rp, bp, bsize, v); 584 if (cy) 585 { 586 ASSERT (bsize < rsize); 587 ASSERT_NOCARRY (mpn_sub_1 (rp + bsize, 588 rp + bsize, rsize - bsize, cy)); 589 } 590 591 MPN_NORMALIZE (rp, rsize); 592 } 593 return rsize; 594 } 595 596 #undef HGCD2_STEP -
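HGCD2_STEP above advances the remainder sequence one step at a time, updating the cofactor magnitudes as u_k = u_i + q*u_j and v_k = v_i + q*v_j, while the sign information is kept separately and flips on every step (hgcd->sign = ~hgcd->sign in the commented simple loop). The invariant behind that bookkeeping is the usual extended-Euclid one: each remainder satisfies r = u*a - v*b on one step and r = v*b - u*a on the next. The toy below is not taken from the patch; it assumes a compiler that provides unsigned __int128 (gcc or clang) and simply re-runs the same recurrence on 64-bit words, asserting the invariant at every step.

/* Standalone toy (not part of the patch): track the cofactor magnitudes u, v
 * through Euclid's algorithm with the same update rule as HGCD2_STEP and
 * verify r = u*a - v*b on even steps, r = v*b - u*a on odd steps.
 * Requires unsigned __int128 (a gcc/clang extension). */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t a = UINT64_C (987654321098765432);
  const uint64_t b = UINT64_C (123456789012345678);

  uint64_t r0 = a, u0 = 1, v0 = 0;      /* r0 =  u0*a - v0*b */
  uint64_t r1 = b, u1 = 0, v1 = 1;      /* r1 = -u1*a + v1*b */
  int parity = 1;                       /* parity of the row (r1, u1, v1) */

  while (r1 != 0)
    {
      uint64_t q = r0 / r1;
      uint64_t r2 = r0 - q * r1;
      uint64_t u2 = u0 + q * u1;        /* same update as HGCD2_STEP */
      uint64_t v2 = v0 + q * v1;

      parity ^= 1;                      /* parity of the new row */
      unsigned __int128 pos = (unsigned __int128) (parity ? v2 : u2)
        * (parity ? b : a);
      unsigned __int128 neg = (unsigned __int128) (parity ? u2 : v2)
        * (parity ? a : b);
      assert (pos - neg == r2);         /* r2 = +-(u2*a - v2*b) */

      r0 = r1; u0 = u1; v0 = v1;
      r1 = r2; u1 = u2; v1 = v2;
    }

  printf ("gcd = %llu, |u| = %llu, |v| = %llu\n",
          (unsigned long long) r0,
          (unsigned long long) u0, (unsigned long long) v0);
  return 0;
}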
mpn/generic/qstack.c
diff -N -u -r gmp-4.2.1/mpn/generic/qstack.c gmp-4.2.1-patched/mpn/generic/qstack.c
old new 1 /* qstack.c 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 2003 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU General Public License as published by 13 the Free Software Foundation; either version 2.1 of the License, or (at your 14 option) any later version. 15 16 The GNU MP Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 19 License for more details. 20 21 You should have received a copy of the GNU General Public License 22 along with the GNU MP Library; see the file COPYING. If not, write to 23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, 24 MA 02111-1307, USA. */ 25 26 #include "gmp.h" 27 #include "gmp-impl.h" 28 #include "longlong.h" 29 30 mp_size_t 31 qstack_itch (mp_size_t size) 32 { 33 /* Limit on the recursion depth */ 34 35 unsigned k = mpn_hgcd_max_recursion (size); 36 37 ASSERT (2 * k < QSTACK_MAX_QUOTIENTS); 38 39 return 2 * size + 12 * (k+1); 40 } 41 42 void 43 qstack_reset (struct qstack *stack, 44 mp_size_t asize) 45 { 46 /* Limit on the recursion depth */ 47 unsigned k = mpn_hgcd_max_recursion (asize); 48 49 stack->size_next = 0; 50 stack->limb_next= 0; 51 stack->nkeep = 2 * (k + 1); 52 ASSERT (stack->nkeep < QSTACK_MAX_QUOTIENTS); 53 54 ASSERT_QSTACK (stack); 55 } 56 57 void 58 qstack_init (struct qstack *stack, 59 mp_size_t asize, 60 mp_limb_t *limbs, mp_size_t alloc) 61 { 62 stack->limb = limbs; 63 stack->limb_alloc = alloc; 64 /* Should depend on input size, we need 2 * recursion depth */ 65 stack->nkeep = QSTACK_MAX_QUOTIENTS - 1; 66 67 qstack_reset (stack, asize); 68 } 69 70 /* Drop all but the nkeep latest quotients. Drop additional quotients 71 if needed to reclain at least SIZE limbs of storage. 
*/ 72 void 73 qstack_rotate (struct qstack *stack, 74 mp_size_t size) 75 { 76 unsigned dropped_sizes; 77 unsigned kept; 78 unsigned i; 79 80 mp_size_t dropped_limbs; 81 82 ASSERT_QSTACK (stack); 83 84 if (stack->size_next > stack->nkeep) 85 dropped_sizes = stack->size_next - stack->nkeep; 86 else 87 dropped_sizes = 0; 88 89 for (i = 0, dropped_limbs = 0; i < dropped_sizes; i++) 90 dropped_limbs += stack->size[i]; 91 92 for (; dropped_limbs < size; dropped_sizes++) 93 { 94 ASSERT (dropped_sizes < stack->size_next); 95 dropped_limbs += stack->size[dropped_sizes]; 96 } 97 98 ASSERT (dropped_limbs <= stack->limb_next); 99 100 kept = stack->size_next - dropped_sizes; 101 102 if (dropped_sizes) 103 /* memmove isn't portable */ 104 for (i = 0; i < kept; i++) 105 stack->size[i] = stack->size[i + dropped_sizes]; 106 107 stack->size_next = kept; 108 109 if (dropped_limbs) 110 { 111 if (dropped_limbs < stack->limb_next) 112 { 113 MPN_COPY_INCR (stack->limb, stack->limb + dropped_limbs, 114 stack->limb_next - dropped_limbs); 115 ASSERT (dropped_limbs <= stack->limb_next); 116 stack->limb_next -= dropped_limbs; 117 } 118 else 119 stack->limb_next = 0; 120 } 121 ASSERT_QSTACK (stack); 122 } 123 124 #if WANT_ASSERT 125 void 126 __gmpn_qstack_sanity (struct qstack *stack) 127 { 128 mp_size_t next; 129 unsigned i; 130 131 for (i = 0, next = 0; i < stack->size_next; i++) 132 { 133 mp_size_t qsize = stack->size[i]; 134 ASSERT (next <= stack->limb_alloc); 135 136 ASSERT (qsize == 0 || stack->limb[next+qsize - 1] != 0); 137 next += qsize; 138 } 139 140 ASSERT (next == stack->limb_next); 141 } 142 #endif
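qstack_rotate above reclaims space by discarding the oldest quotients first: it decides how many leading entries of size[] to drop (all beyond nkeep, plus more until at least SIZE limbs are freed) and then slides both the remaining size[] entries and the remaining limbs down to the start of their buffers. The toy below is not part of the patch; it uses tiny fixed capacities, ignores the nkeep policy, and all of its names are invented. It keeps the same layout, a flat limb buffer plus a table of per-quotient sizes, so the oldest-first discarding is easy to watch: pushing the third quotient overflows the 8-limb buffer and silently drops the first one, much as qstack_push_1 in hgcd2.c calls qstack_rotate when limb_next reaches limb_alloc.

/* Standalone toy (not part of the patch): a flat limb buffer plus a table of
 * entry sizes, with a rotate that frees room by dropping the oldest entries,
 * in the spirit of qstack_rotate.  Capacities are tiny on purpose. */
#include <stdio.h>

#define MAX_ENTRIES 4
#define MAX_LIMBS   8

struct toy_stack
{
  int size[MAX_ENTRIES];       /* length of each stored quotient */
  unsigned limb[MAX_LIMBS];    /* all quotient limbs, oldest first */
  int size_next;               /* number of stored quotients */
  int limb_next;               /* number of limbs in use */
};

/* Drop the oldest entries until NEED limbs and one size[] slot are free. */
static void
toy_rotate (struct toy_stack *s, int need)
{
  int drop = 0, drop_limbs = 0;

  while (drop < s->size_next
         && (s->size_next - drop >= MAX_ENTRIES
             || MAX_LIMBS - (s->limb_next - drop_limbs) < need))
    drop_limbs += s->size[drop++];

  for (int i = 0; i + drop < s->size_next; i++)
    s->size[i] = s->size[i + drop];
  for (int i = 0; i + drop_limbs < s->limb_next; i++)
    s->limb[i] = s->limb[i + drop_limbs];

  s->size_next -= drop;
  s->limb_next -= drop_limbs;
}

/* Push one quotient of N limbs (N <= MAX_LIMBS assumed). */
static void
toy_push (struct toy_stack *s, const unsigned *q, int n)
{
  if (s->limb_next + n > MAX_LIMBS || s->size_next >= MAX_ENTRIES)
    toy_rotate (s, n);
  s->size[s->size_next++] = n;
  for (int i = 0; i < n; i++)
    s->limb[s->limb_next++] = q[i];
}

int
main (void)
{
  struct toy_stack s = { {0}, {0}, 0, 0 };
  unsigned q1[] = { 7, 7, 7 }, q2[] = { 9, 9 }, q3[] = { 4, 4, 4, 4 };

  toy_push (&s, q1, 3);
  toy_push (&s, q2, 2);
  toy_push (&s, q3, 4);        /* forces a rotate; q1 is discarded */

  for (int i = 0, next = 0; i < s.size_next; next += s.size[i], i++)
    {
      printf ("entry %d:", i);
      for (int j = 0; j < s.size[i]; j++)
        printf (" %u", s.limb[next + j]);
      printf ("\n");
    }
  return 0;
}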