5 files changed, 2580 insertions, 487 deletions
diff --git a/doc/README.dev b/doc/README.dev
index 80d502626..067423f99 100644
--- a/doc/README.dev
+++ b/doc/README.dev
@@ -440,6 +440,9 @@ List of macros used for checking MPFR:
                         for no timeout by default, and can be overridden
                         at "make check" time with the MPFR_TESTS_TIMEOUT
                         environment variable.
++ MPFR_COV_CHECK:       Define to enable value coverage checking (must not
+                        be used in production). This macro is for the MPFR
+                        developers, in order to improve the testsuite.
 
 ===========================================================================
 
diff --git a/doc/sum.txt b/doc/sum.txt
new file mode 100644
index 000000000..b51b95456
--- /dev/null
+++ b/doc/sum.txt
@@ -0,0 +1,643 @@
+Implementation of mpfr_sum: sum of n mpfr_t values
+══════════════════════════════════════════════════
+
+About the time/memory complexity: The original mpfr_sum algorithm was
+inefficient on some inputs, and in some simple cases with inputs of
+different orders of magnitude, it could be very slow and take all the
+memory, making the whole program (and possibly other programs on the
+system too, due to the lack of memory at some critical point) crash.
+The time/memory complexity must no longer depend on the value of the
+exponents of the inputs, i.e. their order of magnitude. Moreover, be
+careful with the carry propagation.
+
+Preliminary note: The previous mpfr_sum algorithm was a high-level one
+and could take too much memory and too much time in cases with numbers
+of different magnitudes and cancellation. Here we will use functions
+from the mpn layer of GMP only (no mpfr_add), as the algorithm is based
+on additions by blocks, involving virtual splitting of MPFR numbers.
+
+Concerning the sign of a zero result, the specification is:
+  * if n = 0, then the result is +0 (this choice is consistent with
+    IEEE 754-2008's sum reduction operation of vector length 0, and
+    with mpfr_set_ui and mpfr_set_si on the integer 0); otherwise
+  * if all the inputs have the same sign (i.e. all +0 or all -0),
+    then the result has the same sign as the inputs (for n = 1 or 2,
+    this choice is consistent with mpfr_set and mpfr_add respectively);
+  * otherwise, either because all inputs are zeros with at least a +0
+    and a -0, or because some inputs are non-zero (but they globally
+    cancel), the result is +0, except for the MPFR_RNDD rounding mode,
+    where it is -0 (this choice is consistent with mpfr_add for n = 2,
+    and more generally with a sequence of mpfr_add's).
+Additional details on:
+  https://sympa.inria.fr/sympa/arc/mpfr/2014-03/msg00000.html
+
+Let sq be the output precision.
+
+*** TODO ***
+The following is a bit obsolete. In particular the new helper function
+sum_raw (obtained from code refactoring for steps 3 to 6, and 8) should
+be introduced and described. Pre-rounding in (7) and rounding in (8)
+have been merged, and this [previously pre-rounding in (7)] is still
+done at the same time as sign handling with a single mpn operation.
+The table concerning the correction term and the ternary value in case
+of TMD should be changed to:
+
+     rnd   tmd    R    sst      correction   ternary
+     ───────────────────────────────────────────────
+      N     1     0     -            0          +
+      N     1     0     0            0          0
+      N     1     0     +            0          -
+      N     1     1     -           +1          +
+      N     1     1     0           +1          0
+      N     1     1     +           +1          -
+     ───────────────────────────────────────────────
+      N     2     0     -            0          -
+      N     2     0     0            ?          ?     (halfway case)
+      N     2     0     +           +1          +
+      N     2     1     -            0          -
+      N     2     1     0            ?          ?     (halfway case)
+      N     2     1     +           +1          +
+     ───────────────────────────────────────────────
+      D     1     0     -           -1          -
+      D     1     0     0            0          0
+      D     1     0     +            0          -
+      D     1     1     -            0          -
+      D     1     1     0           +1          0
+      D     1     1     +           +1          -
+     ───────────────────────────────────────────────
+      U     1     0     -            0          +
+      U     1     0     0            0          0
+      U     1     0     +           +1          +
+      U     1     1     -           +1          +
+      U     1     1     0           +1          0
+      U     1     1     +           +2          +
+     ───────────────────────────────────────────────
+
+The general ideas of the algorithm:
+
+0. Handle the case n <= 1 separately (+0 for n = 0, mpfr_set for n = 1).
+   Now assume that n >= 2.
+
+1. Look at the exponent field of each mpfr_t input (this is fast); also
+   track the signs of Inf's and zeros. This step allows one to:
+     * detect the singular cases, i.e. when there is either at least
+       a NaN or an Inf in the inputs (in case of a NaN or two Inf's of
+       different signs, we can immediately return NaN), or all inputs
+       are zeros;
+     * determine the maximum exponent maxexp of the regular inputs and
+       the number n' of regular inputs.
+
+2. A truncated sum will be computed in two's complement, stored in a
+   fixed memory area, called "accumulator", whose characteristics are
+   determined here.
+
+   Two's complement is preferred to the sign + magnitude representation
+   because the signs of the temporary (and final) results are not known
+   in advanced, and the computations (additions and subtractions) in
+   two's complement are more natural in this context. There will be a
+   conversion to sign + magnitude (representation used by MPFR numbers)
+   at the end, but this should not take much time compared to the other
+   calculations.
+
+   Only a fixed part of the inputs will be taken into account for the
+   truncated sum: the bits whose exponent is in some window (interval)
+   denoted [minexp,maxexp[.
+
+   We choose not to include maxexp in the interval in order to match the
+   floating-point representation chosen in MPFR, where the significand
+   is in [1/2,1[; this also means that the current minexp will be maxexp
+   at the next iteration, unless there is a "hole" between the inputs,
+   as explained below.
+
+   Let us define logn = ceil(log2(n')).
+
+   Due to the accumulation of values and the choice of two's complement
+   (a way to represent the sign), we will need some extra bits to avoid
+   overflows. The absolute value of the sum is less than n' * 2^maxexp,
+   taken up to logn extra bits, and one needs one more bit to be able
+   to determine the sign, so that cq = logn + 1 extra bits will be
+   considered.
+
+   For the other side, we define minexp = maxexp - sq - dq, where dq
+   will be chosen around logn to take into account the accumulation
+   of errors, i.e. from everything less significant than minexp. The
+   final choice for dq should be done after testing: in practice, one
+   may need a little more precision to handle partial cancellation at
+   this iteration, but important cancellation will always lead to other
+   iterations. For instance, we will choose for dq the smallest value
+   such that dq ≥ max(logn,4) (the bound 4 will be explained later),
+   and the accumulator bitsize wq, which is maxexp + cq - minexp, i.e.
+   cq + sq + dq, is a multiple of GMP_NUMB_BITS.
+
+   Accumulator:  [--------]-----------------------------------]
+                     cq   └─ maxexp      sq + dq      minexp ─┘
+
+3. Compute the truncated sum in two's complement by taking into account
+   the part of the inputs in the window [minexp,maxexp[.
+
+   In the same loop over the inputs, determine the maximum exponent
+   maxexp2 of the numbers formed starting with the most significant
+   represented bit that has been ignored: one will get either minexp
+   (if an input has been truncated at this iteration) or the maximum
+   exponent of the numbers that have been completely ignored. If no
+   bits have been ignored any longer, then maxexp2 has the special
+   value MPFR_EXP_MIN, which is the minimum value of the mpfr_exp_t
+   type and not a valid exponent; and the truncated sum is exact.
+
+   The accumulator during this iteration:
+
+                 [--------]-----------------------------------]
+                     cq   └─ maxexp                   minexp ─┘
+
+   If there is a additional iteration with maxexp2 = minexp - 4 and
+   a shift of 26 bits, here's the accumulator after the shift:
+
+                                     <------- 26 zeros ------->
+                 [-------------------0000000000000000000000000]
+   This iteration:          minexp ─┘   ├─ maxexp2            │
+   Next iteration:                      └─ maxexp     minexp ─┘
+
+4. Determine the number of cancelled bits, more precisely, define the
+   variable cancel = number of identical bits on the most significant
+   part of the accumulator.
+
+5. If the truncated sum (i.e. the value of the accumulator) is 0, i.e.
+   if all the words are 0, then:
+     * if maxexp2 == MPFR_EXP_MIN, then return with the exact value ±0
+       (its sign is determined from the rounding mode, as specified);
+     * else reiterate at (3) with maxexp = maxexp2 and minexp determined
+       from the initial value of cq (like in the first iteration).
+
+6. Let e = maxexp + cq - cancel, u = e - sq, and err = maxexp2 + logn.
+   Then the absolute value of the truncated sum is in [2^(e-1),2^e]
+   (binade closed on both ends due to two's complement), u is the
+   exponent of the ulp (a.k.a., quantum exponent) in the destination
+   (except in the exact case -2^e), and the absolute value of the error
+   is strictly less than the bound 2^err; if maxexp2 == MPFR_EXP_MIN,
+   then the error is 0, i.e. the sum in the accumulator is exact.
+
+   Here's a representation of the accumulator and the cancel bits,
+   with the two cases depending on the sign of the truncated sum,
+   where the x's correspond to the sq-1 represented bits following
+   the initial value bit 1 (0 if negative), r is the rounding bit,
+   and the bits f are the following bits:
+
+     ][------------------- accumulator -------------------]
+     ][---- cancel ----]----------------------------------]
+     ]0000000000000000001xxxxxxxxxxxxxxxxxxxxxrffffffffffff
+     ]1111111111111111110xxxxxxxxxxxxxxxxxxxxxrffffffffffff
+     └─ maxexp + cq    └─ e               u ─┘    minexp ─┘
+
+   If the cancellation is important, then the bit r and some of the
+   least significant bits x may fall outside of the accumulator, in
+   which case they are considered to be 0. The most degenerate case
+   is the following:
+
+     * If the cancel bits are 0's, then due to (5), e = minexp + 1,
+       i.e. a single bit (the leading 1) is represented.
+
+     * If the cancel bits are 1's, all the bits of the accumulator are
+       1's. Since the non-represented bits are considered to be 0, the
+       following bit is a 0; this implies that the computed value of
+       cancel is the right one, i.e. as if the following bits were
+       represented in the accumulator, and proves the above assertion
+       "the absolute value of the truncated sum is in [2^(e-1),2^e]".
+       Here we have e = minexp, and none of the bits are represented.
+
+   In the implementation, one must make sure that one does not try to
+   read non-represented bits.
+
+   The exponent and related values may change later due to the error,
+   but this doesn't really matter here.
+
+   If the error bound is too large due to the cancellation, e.g. if
+   err > u - 3 (the proofs below are done with this choice), then
+   one reiterates at (3) after shifting the truncated sum to the
+   left boundary (most significant part) of the accumulator, where:
+     * the shift count is determined in such a way to avoid overflows
+       at the next iteration, i.e. to be able to retrieve the sum with
+       its sign;
+     * the new value of cq is implied by this shift count and maxexp2
+       (the value of maxexp at the next iteration).
+
+   Concerning the choice of the shift count shiftq, letting only one
+   identical bit in the most significant part may not be sufficient;
+   for instance, if
+   a. maxexp2 = minexp;
+   b. the accumulator contains:
+
+        0000000011111111111111111111111111110101
+   This iteration:                     minexp ─┘
+
+   c. the accumulator is shifted to:
+
+        0111111111111111111111111111101010000000
+   Next iteration:                      └─ maxexp
+
+   d. there are at least 12 numbers to add at the next iteration;
+   then one could end up with something like:
+
+        1000000000000000000000000000000000010001
+                                        └─ maxexp
+
+   i.e. an overflow in two's complement. But leaving at least
+   2 + max(0, err - e) itentical bits in the most significant part,
+   such as
+
+        0011111111111111111111111111110101000000
+                                         └─ maxexp
+
+   is sufficient. The second term of the max covers cases like:
+
+                                       ┌─ err = maxexp2 + logn
+        0000000000000000000000000000000000000111
+                                         e ─┘
+
+   (2^err being a bound on the error term, thus a bound on the value
+   that will be added to the accumulator at the next iteration), for
+   which the accumulator can be shifted to:
+
+         ┌─ err
+        0000000111000000000000000000000000000000
+           e ─┘
+
+   without triggering an overflow at the next iteration, but
+
+        ┌─ err
+        0000001110000000000000000000000000000000
+          e ─┘
+
+   is incorrect as a 1 could appear in the MSB (making the accumulator
+   value negative) just with additions of positive numbers.
+
+   Said otherwise, leaving at least i identical bits allows one to
+   represent numbers in [-2^(e+i-1),2^(e+i-1)[. The current sum is in
+   [-2^e,2^e[, and taking into account the error terms, one wants to
+   be able to represent arbitrary numbers in ]-2^e-2^err,2^e+2^err[.
+   So, i must be chosen in such a way that 2^e + 2^err ≤ 2^(e+i-1),
+   i.e. 2^0 + 2^(err-e) ≤ 2^(i-1). The smallest power of two larger
+   than or equal to 2^0 + 2^(err-e) has exponent 1 + max(0, err - e).
+   Hence the choice i ≥ 2 + max(0, err - e).
+
+   So, we have: shiftq = cancel - 2 - max(0, err - e). And at the next
+   iteration,
+
+     cq' = wq - shiftq + (minexp - maxexp2)
+
+   Now one needs to prove that this makes sense, i.e. that shiftq > 0
+   and that cq' < wq; otherwise all the inputs will be ignored, making
+   a new iteration useless, with a possible infinite loop. So, shiftq
+   must be chosen so that
+
+     shiftq > minexp - maxexp2
+
+   Note that since maxexp2 ≤ minexp, this will also yield shiftq > 0.
+   The condition err > u - 3 gives:
+
+     maxexp2 + logn > maxexp + cq - cancel - sq - 3
+
+   Therefore, since maxexp - minexp = sq + dq,
+
+     cancel - 2 > minexp - maxexp2 - logn + cq + dq - 5
+
+   And since cq = logn + 1 and dq ≥ 4,
+
+     cancel - 2 > minexp - maxexp2                              [A]
+
+   We also have:
+
+     cancel - 2 - logn + e = maxexp - 2 - logn + cq > minexp
+
+   Therefore
+
+     cancel - 2 - (err - e) > minexp - maxexp2                  [B]
+
+   [A] and [B] give: shiftq > minexp - maxexp2. QED.
+
+   Note: It is expected that in general, the cancellation is not very
+   large, so that the new additions in (3) will occur only in a small
+   part of the accumulator, except in case of long carry propagation
+   (see below).
+
+7. Now, the accumulator contains the significand of a good approximation
+   to the exact sum, since the absolute value of the error is strictly
+   less than ulp(computed value) * 2^(-3). The corresponding exponent is
+   e and the sign is determined from the cancel bits. One will copy the
+   rounded significand to the destination, together with the exponent
+   and the sign. One has different cases:
+
+     * If all the bits of the inputs have been taken into account, then
+       the significand is exact (maxexp2 == MPFR_EXP_MIN), so that one
+       can round it correctly. Note: The TMD does not occur here.
+
+     * If the error bound is non-zero (because not all the bits of the
+       inputs have been taken into account) but the TMD does not occur,
+       then one can also round the significand correctly.
+
+     * Otherwise, one will round the significand, assuming that this is
+       the correct truncation up to the rounding bit and that the sticky
+       bit is 1: this yields the most probable correctly rounded result
+       if u - minexp > 2 (the most general case), based on a symmetrical
+       distribution of the error centered on 0; the destination will be
+       corrected later if need be.
+
+   For the last two cases, and before the rounding is done, one needs
+   to determine whether the TMD occurs. Let d = u - err, which is ≥ 3
+   and can be very large; however d is representable in a mpfr_exp_t
+   since u ≤ emax, err ≥ emin, and this type can hold the sum of two
+   valid exponents. The TMD occurs when the sum is close enough to a
+   breakpoint, which is either a machine number (i.e. a number whose
+   significand fits on sq bits) or a midpoint between two consecutive
+   machine numbers, depending on the rounding mode:
+                   ┌───────────────┬────────────────┐
+                   │ Rounding mode │   Breakpoint   │
+                   ├───────────────┼────────────────┤
+                   │  to nearest   │    midpoint    │
+                   │  to nearest   │ machine number │
+                   │   directed    │ machine number │
+                   └───────────────┴────────────────┘
+   (in the second case, the correctly rounded sum can be determined,
+   but not the ternary value). More precisely, the TMD occurs when:
+     * the d bits following the ulp one are identical in directed
+       rounding modes;
+     * the d-1 bits following the rounding bit are identical, in the
+       rounding-to-nearest mode.
+
+   At the same time, one determines whether the correctly rounded sum
+   is exact. If the TMD occurs, this value is currently assumed to be
+   inexact, as said above.
+
+   Let us recall that in case of an important cancellation, some bits
+   may not be represented in the accumulator. Such a case is possible
+   even in this step, when maxexp2 is much smaller than minexp.
+
+   A change of representation is needed since the sum has been computed
+   using two's complement representation while regular MPFR numbers are
+   represented in sign + absolute value (normalized). The sign bit is
+   directly obtained from the cancel bits. For the absolute value:
+
+     * If the cancel bits are 0's, the value is positive, so that the
+       absolute value is represented in the same way.
+
+     * If the cancel bits are 1's, the value is negative, so that the
+       absolute value is formally obtained by inverting all the bits
+       (this infinite binary expansion is not the canonical one, but
+       this does not matter as this is just for the discussion below).
+       Since there is no carry propagation at this point, the exponent
+       is not changed.
+
+   With the mpn layer of GMP, up to 3 operations are involved for the
+   handling of the significand:
+     * a shift (for the alignment), if the shift count is non-zero
+       (this is the most probable case);
+     * a negation if the value is negative;
+     * the initial rounding.
+   The shift should be done before the initial rounding, so that all
+   the bits are represented for the rounding. The copy itself should
+   be done together with the shift or the negation, because this is
+   where most of the limbs are changed in general. It will be done
+   with the shift, and see below for the other two operations.
+
+   The rounding and change of representation (as a whole) can be seen
+   in the following way: after scaling, a number written r = i + f in
+   two's complement, where i is the integer part and f the infinite
+   fractional part, has to be rounded to one of the enclosing integers
+   i and i + 1. Thus the rounding operation consists in replacing f
+   by an integer c = 0 or 1 (like a carry): the rounded value is the
+   integer i + c. When r is negative (i.e. the MSB of i is 1), the
+   following forms are equivalent:
+
+     |i + c| = - (i + c)                        [1]
+             = (- i) - c                        [2]
+             = rnd(-r) = rnd(~r) = ~i + ~c      [3]
+
+   This can lead to various possible implementations. In each of these
+   3 cases, one has two potentially multiple-precision operations.
+     [1] addition of a carry, followed by a negation;
+     [2] negation, followed by a subtraction of a carry;
+     [3] bit inversion, followed by an addition of a carry.
+   In average, the operation with a carry is in constant time; it is an
+   operation on a single word in most cases. But in the worst case, due
+   to possible long carry propagation, it may need a read/write on the
+   whole significand. If GMP had a negation-with-borrow operation, one
+   could merge these two multiple-operations in a single one; but this
+   is not the case. However one can notice that either c or ~c is 0, so
+   that one can select the operation depending on the value of c. As a
+   summary, consider the following cases:
+     * r positive:        |i + c| = i + c with mpn_add_1;
+     * r negative, c = 0: |i + c| = -i with mpn_neg ([1]/[2]);
+     * r negative, c = 1: |i + c| = ~i with mpn_com ([3]).
+   The first two cases can yield an increment of the exponent e.
+
+   One goal of this early copy/rounding is to free the accumulator in
+   case there is work to do in (8), though the needed memory for that
+   is limited. An alternative solution would be to reserve a bit more
+   memory for (8), and determine the way of rounding before the copy;
+   this could be slightly more efficient in supposedly rare cases
+   (when the correction yields a long carry propagation).
+
+   As said above, in rounding to nearest, both kinds of breakpoints
+   (machine numbers and midpoints) are taken into account for the TMD.
+   Since the final rounding and ternary value will depend on the kind
+   of breakpoint, one needs to remember this information in a variable
+   (as the accumulator will be overwritten). Thus a variable tmd will
+   contain:
+     * 0: the TMD does not occur;
+     * 1: the TMD occurs on a machine number;
+     * 2: the TMD occurs on a midpoint.
+
+   If the TMD does not occur (tmd = 0):
+     * Determine the ternary value as follows: if the sum is exact
+       (already known), the ternary value is 0; otherwise it is -1
+       if c = 0, and 1 if c = 1.
+     * Check the range and return as in (9).
+
+8. Reiterate like in (3) one or more times using a specific window as
+   if sq were 0, i.e. around maxexp + logn to maxexp - logn (wq - sq
+   may be a good choice); in short, determine the sign of a secondary
+   term, which is the difference between the exact sum and the closest
+   breakpoint.
+
+   During the iterations, when there are several identical bits on the
+   left (most significant part) of the accumulator, only one bit needs
+   to be represented during the computations. But to avoid overflows,
+   one will keep 2 or more at the start of an iteration, like what is
+   done in (6).
+
+   Let us see what needs to be done to prepare the first iteration.
+   Since d ≥ 3 and the TMD occurs, the secondary term is, in absolute
+   value, less than half the weight of the rounding bit, so that it can
+   be seen as the fractional part after the rounding bit; the leading
+   d-1 bits of the truncated secondary term are identical and give its
+   sign (in the two's complement meaning). Now, if err < minexp, then
+   at least one of these identical bits is not represented, meaning that
+   it is 0 and all these bits are 0's; thus, under this condition, the
+   accumulator is zeroed and minexp is determined from maxexp2 and the
+   initial value of cq, like in (5) and the first iteration. Otherwise
+   the identical bits are all represented, and similarly to (6), one
+   shifts the trailing bits, keeping the last 2 over the d-1 identical
+   bits, i.e. one shifts the bits from err+1 to minexp, and zeros the
+   remaining bits of the accumulator; the new minexp is obtained by
+   subtracting the shift count from it.
+
+   Once the sign of the secondary term is known, depending on:
+     * the rounding mode rnd (equivalent to N, D, U),
+     * the value of tmd (1 or 2 in rounding to nearest),
+     * the rounding bit R (0 or 1),
+     * the sign sst of the secondary term (negative, zero, positive),
+   correct the sum if need be (± 1 ulp) and determine the corresponding
+   ternary value, as in the table below; a question mark means that the
+   exact value is the middle of two consecutive machine numbers, and a
+   special rule is used to determine the rounding, equivalent to one of
+   the enclosing lines (the ones where sst is - or +):
+     * For nearest-even:
+         - last significand bit = 0: choose line where correction = 0;
+         - last significand bit = 1: choose line where correction ≠ 0.
+     * For nearest-away: choose line where error = +.
+
+     rnd   tmd    R    sst      correction   ternary
+     ───────────────────────────────────────────────
+      N     1     0     -           0           +
+      N     1     0     0           0           0
+      N     1     0     +           0           -
+      N     1     1     -           0           +
+      N     1     1     0           0           0
+      N     1     1     +           0           -
+     ───────────────────────────────────────────────
+      N     2     0     -           0           -
+      N     2     0     0           ?           ?     (halfway case)
+      N     2     0     +           +           +
+      N     2     1     -           -           -
+      N     2     1     0           ?           ?     (halfway case)
+      N     2     1     +           0           +
+     ───────────────────────────────────────────────
+      D     1     0     -           -           -
+      D     1     0     0           0           0
+      D     1     0     +           0           -
+      D     1     1     -           0           -
+      D     1     1     0           +           0
+      D     1     1     +           +           -
+     ───────────────────────────────────────────────
+      U     1     0     -           -           +
+      U     1     0     0           -           0
+      U     1     0     +           0           +
+      U     1     1     -           0           +
+      U     1     1     0           0           0
+      U     1     1     +           +           +
+     ───────────────────────────────────────────────
+
+9. Check the range (as usual), and return.
+
+An example:
+
+              [1]    [2]   A  [3]
+  u0 = *****   |      |    .   |
+  u1 =   ******|**    |    .   |
+  u2 = ********|***** |    .   |
+  u3 =         | *****|    .   |
+  u4 =         |      |    ****|**
+  u5 =         |      |     ***|*
+
+At iteration 1, minexp is determined to be [1]; thus u0, a part of u1,
+and a part of u2 are taken into account for the truncated sum. Then it
+appears that an important cancellation occurred, and another step (3)
+is needed. Since u1 was truncated, the new maxexp will be minexp, i.e.
+[1]. At iteration 2, minexp is determined to be [2]; thus a part of u1,
+a part of u2, and u3 are taken into account for the truncated sum. Now
+assume that on this example, the error is small enough, but its sign is
+unknown. Thus another step (3) is needed, with the conditions of (7).
+Since no numbers were truncated at the previous iteration, maxexp is
+the maximum exponent of the remaining numbers, here the one of u4, and
+minexp is determined to be [3]. Assume that the sign of the error can
+be determined now, so that we can return the rounded result with the
+ternary value.
+
+As a bonus, this will also solve overflow, underflow and normalization
+issues, since everything is done in fixed point and the output exponent
+will be considered only at the end (early overflow detection could also
+be done).
+
+A limb L = *zp in memory will generally contain a part of a significand.
+One can define its exponent ze, such that the actual value of this limb
+is L * 2^(ze-GMP_NUMB_BITS), i.e. ze is its exponent where the limb is
+regarded as a number in [1/2,1[. If an array of limbs zp[] is regarded
+as a significand in [1/2,1[, then the exponent of its actual value is
+also ze.
+
+Variables used in the implementation:
+  tp: pointer to a temporary area that will contain a shifted value.
+  wp: pointer to the accumulator.
+  ts: size of the temporary area, in limbs; then wp = tp + ts.
+  ws: size of the accumulator, in limbs.
+  wq: size of the accumulator, in bits.
+  rn: number n' of inputs that are regular numbers (regular inputs).
+  logn: ceil(log2(rn)).
+  cq: logn + 1.
+  sq: output precision (precision of the sum).
+  maxexp: the maximum exponent of the bit-window of the inputs that is
+          taken into account (for the current iteration), excluded;
+          after the sum, it gets the value of maxexp2 until the next
+          iteration.
+  minexp: the minimum exponent of the bit-window of the inputs that is
+          taken into account (for the current iteration), included.
+
+Note 1: Data locality can be improved after the first iteration if the
+shifted values are stored at the end of the temporary area instead of
+the beginning. The reason is that only the least significant part of
+the accumulator will be used once a good approximate value of the sum
+is known, and the accumulator lies just after the temporary area. But
+the gain would probably not be noticeable in practice.
+
+Note 2: At step (4), it was considered to determine the number of
+cancelled limbs instead of the number of cancelled bits in order to
+avoid a non-trivial shift at step (6), making this step a bit faster
+(but particularly in small precisions, with some possible drawbacks).
+Moreover, on platforms with a MMU, thus most platforms where GNU MPFR
+would be used, we could arrange to reserve a specific address space
+for the accumulator and choose the shift count to be a multiple of a
+memory block size (or actually not to do any shift/copy at all, just
+free the most significant part and allocate a new part when need be),
+so that the shift (copy) of the accumulator contents could just be
+MMU related changes, without any physical memory copy. However, in
+practice, only large precisions with specific inputs would benefit
+from such a choice, which would require non portable code, and the
+relative speedup would remain small since the sum itself is expected
+to take most of the time.
+
+Note 3: Compared with Ziv's strategy, the issue here is not really
+exact values that are very close to a breakpoint, but cancellation.
+Moreover, we do not need to recompute everything at each iteration.
+The issue with the value very close to a breakpoint actually occurs
+at step (8); however, contrary to the usual case, we do not want to
+reiterate with more precision as this could take too much time and
+memory (a much wider accumulator could be needed). Indeed, due to a
+possible "hole" between the inputs, the distance between the exact
+value and a breakpoint can be extremely small.
+
+*** To be considered in future versions ***
+It seems that carry propagation (mpn_add_1 & mpn_sub_1 in the code) is
+most often limited. But consider the following cases, where all inputs
+have the minimal precision 2, and the output precision is p:
+  u0 = 1
+  u_i = (-1)^i * 2^(-p) for i > 0
+Here long carry propagation will occur for each addition of the initial
+iteration, so that the complexity will be O(n*p) instead of O(n+p) if
+we choose to delay carry propagation (however such a choice may slower
+the average case and take more memory, such as around 3*p instead of
+2*p).
+When a new iteration is needed due to cancellation, a second accumulator
+was considered in some early version of the algorithm: the temporary
+results of the computations during the new iteration would be stored in
+this second accumulator, which would generally be small, thus limiting
+carry propagation; this method is actually equivalent to delaying carry
+propagation. It could help in some cases, such as:
+  u0 = 2^q with some q > 0
+  u1 = 1
+  u2 = -2^q
+  u_i = (-1)^i * 2^(-p) for i > 2
+but such examples are very specific cases, and as seen with the first
+example, a better way must be chosen if avoiding long carry propagation
+is regarded as important (in all cases). Moreover, while the use of two
+accumulators does not take much more memory (since both accumulators can
+occupy the same area, with a flexible limit between them), it probably
+makes the code a bit more complex, and noticeably slower if n is small.
diff --git a/src/mpfr-impl.h b/src/mpfr-impl.h
index 8c3af99d4..0caec6f18 100644
--- a/src/mpfr-impl.h
+++ b/src/mpfr-impl.h
@@ -2079,4 +2079,31 @@ __MPFR_DECLSPEC void mpfr_mpz_clear _MPFR_PROTO((mpz_ptr));
 #endif
 
 
+/******************************************************
+ *************  Value Coverage Checking  **************
+ ******************************************************/
+
+#ifdef MPFR_COV_CHECK
+
+/* Variable names should start with the __gmpfr_cov_ prefix. */
+
+#define MPFR_COV_SET(X) (__gmpfr_cov_ ## X = 1)
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+__MPFR_DECLSPEC extern int __gmpfr_cov_sum_tmd[MPFR_RND_MAX][2][2][3][2];
+
+#if defined (__cplusplus)
+}
+#endif
+
+#else
+
+#define MPFR_COV_SET(X) ((void) 0)
+
+#endif
+
+
 #endif
diff --git a/src/sum.c b/src/sum.c
index 1d30d9201..2c22ad7fb 100644
--- a/src/sum.c
+++ b/src/sum.c
@@ -1,6 +1,6 @@
 /* Sum -- efficiently sum a list of floating-point numbers
 
-Copyright 2004-2015 Free Software Foundation, Inc.
+Copyright 2014-2015 Free Software Foundation, Inc.
 Contributed by the AriC and Caramel projects, INRIA.
 
 This file is part of the GNU MPFR Library.
@@ -20,327 +20,1152 @@ along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
 
-/* Reference: James Demmel and Yozo Hida, Fast and accurate floating-point
-   summation with application to computational geometry, Numerical Algorithms,
-   volume 37, number 1-4, pages 101--112, 2004. */
-
 #define MPFR_NEED_LONGLONG_H
 #include "mpfr-impl.h"
 
-/* FIXME[VL]: mpfr_sum can take too much memory and too much time.
-   Rewrite using mpn, with additions by blocks (most significant first)?
-   The algorithm could be as follows:
-   1. Search the maximum exponent max_exp of the inputs.
-   2. Compute the truncated sum in a window around max_exp + log2(N) to
-      max_exp - output_prec - 128.
-   3. In case of cancellation, this determines a new maximum exponent;
-      so, reiterate at (2), or (1) if the truncated sum is zero (so that
-      "holes" will count for nothing).
-   4. Take care of the TMD (something like above, since there can still
-      be cancellations).
-   As a bonus, this will also solve overflow and underflow issues, since
-   everything is done in fixed point and the output exponent will be
-   considered only at the end (early overflow detection can also be done).
+/* See the doc/sum.txt file for the algorithm and a part of its proof
+(this will later go into algorithms.tex).
 
 Note: see the following paper and its references:
 http://www.eecs.berkeley.edu/~hdnguyen/public/papers/ARITH21_Fast_Sum.pdf
+VL: This is very different:
+          In MPFR             In the paper & references
+    arbitrary precision            fixed precision
+     correct rounding        just reproducible rounding
+    integer operations        floating-point operations
+        sequencial             parallel (& sequential)
 */
 
-/* I would really like to use "mpfr_srcptr const []" but the norm is buggy:
-   it doesn't automatically cast a "mpfr_ptr []" to "mpfr_srcptr const []"
-   if necessary. So the choice are:
-     mpfr_s **                : OK
-     mpfr_s *const*           : OK
-     mpfr_s **const           : OK
-     mpfr_s *const*const      : OK
-     const mpfr_s *const*     : no
-     const mpfr_s **const     : no
-     const mpfr_s *const*const: no
-   VL: this is not a bug, but a feature. See the reason here:
-     http://c-faq.com/ansi/constmismatch.html
-*/
-static void heap_sort (mpfr_srcptr *const, unsigned long, mpfr_srcptr *);
-static void count_sort (mpfr_srcptr *const, unsigned long, mpfr_srcptr *,
-                        mpfr_exp_t, mpfr_uexp_t);
-
-/* Either sort the tab in perm and returns 0
-   Or returns 1 for +INF, -1 for -INF and 2 for NAN.
-   Also set *maxprec to the maximal precision of tab[0..n-1] and of the
-   initial value of *maxprec.
-*/
-int
-mpfr_sum_sort (mpfr_srcptr *const tab, unsigned long n, mpfr_srcptr *perm,
-               mpfr_prec_t *maxprec)
+#ifdef MPFR_COV_CHECK
+int __gmpfr_cov_sum_tmd[MPFR_RND_MAX][2][2][3][2] = { 0 };
+#endif
+
+/* Update minexp after detecting a potential integer overflow in extreme
+   cases (only 32-bit machines may be concerned in practice). */
+#define UPDATE_MINEXP(E,SH)                     \
+  do                                            \
+    {                                           \
+      mpfr_prec_t sh = (SH);                    \
+      MPFR_ASSERTN ((E) >= MPFR_EXP_MIN + sh);  \
+      minexp = (E) - sh;                        \
+    }                                           \
+  while (0)
+
+/* Accumulate a new [minexp,maxexp[ block into (wp,ws). If e and err denote
+ * the exponents of the computed result and of the error bound respectively,
+ * while e - err is less than some given bound (due to cancellation), shift
+ * the accumulator and reiterate.
+ *   wp: pointer to the accumulator (least significant limb first).
+ *   ws: size of the accumulator.
+ *   wq: precision of the accumulator (ws * GMP_NUMB_BITS).
+ *   x: array of the input numbers.
+ *   n: size of this array (number of inputs).
+ *   minexp: exponent of the least significant bit of the block.
+ *   maxexp: exponent of the block (maximum exponent + 1).
+ *   tp: pointer to a temporary area.
+ *   ts: size of this temporary area.
+ *   logn: ceil(log2(rn)), where rn is the number of regular inputs.
+ *   prec: minimal value of e - err (see below).
+ *   ep: pointer to mpfr_exp_t (see below), or a null pointer.
+ *   minexpp: pointer to mpfr_exp_t (see below).
+ *   maxexpp: pointer to mpfr_exp_t (see below).
+ * This function returns the number of cancelled bits (>= 1), or 0
+ * if the accumulator is 0 (then the exact sum is necessarily 0).
+ * In the former case, the function also returns:
+ * - in ep: the exponent e of the computed result;
+ * - in minexpp: the last value of minexp.
+ * - in maxexpp: the new value of maxexp (for the next iteration).
+ * Notes:
+ * - minexp is also the exponent of the least significant bit of the
+ *   accumulator;
+ * - the temporary area must be large enough to hold a shifted input
+ *   block, and the value of ts is used only when the full assertions
+ *   are checked (i.e. with the --enable-assert configure option), to
+ *   check that a buffer overflow doesn't occur;
+ * - contrary to the returned value of minexp (the value in the last
+ *   iteration), the returned value of maxexp is the one for the next
+ *   iteration (= maxexp2 of the last iteration).
+ */
+static mpfr_prec_t
+sum_raw (mp_limb_t *wp, mp_size_t ws, mpfr_prec_t wq, mpfr_ptr *const x,
+         unsigned long n, mpfr_exp_t minexp, mpfr_exp_t maxexp,
+         mp_limb_t *tp, mp_size_t ts, int logn, mpfr_prec_t prec,
+         mpfr_exp_t *ep, mpfr_exp_t *minexpp, mpfr_exp_t *maxexpp)
 {
-  mpfr_exp_t min, max;
-  mpfr_uexp_t exp_num;
-  unsigned long i;
-  int sign_inf;
-
-  sign_inf = 0;
-  min = MPFR_EMIN_MAX;
-  max = MPFR_EMAX_MIN;
-  for (i = 0; i < n; i++)
+  MPFR_LOG_FUNC
+    (("ws=%Pd ts=%Pd prec=%Pd", (mpfr_prec_t) ws, (mpfr_prec_t) ts, prec),
+     ("", 0));
+
+  MPFR_ASSERTD (prec >= 0);
+
+  /* Consistency check. */
+  MPFR_ASSERTD (wq == (mpfr_prec_t) ws * GMP_NUMB_BITS);
+
+  while (1)
     {
-      if (MPFR_UNLIKELY (MPFR_IS_SINGULAR (tab[i])))
-        {
-          if (MPFR_IS_NAN (tab[i]))
-            return 2; /* Return NAN code */
-          else if (MPFR_IS_INF (tab[i]))
+      mpfr_exp_t maxexp2 = MPFR_EXP_MIN;
+      unsigned long i;
+
+      MPFR_LOG_MSG (("sum_raw loop: "
+                     "maxexp=%" MPFR_EXP_FSPEC "d "
+                     "minexp=%" MPFR_EXP_FSPEC "d\n",
+                     (mpfr_eexp_t) maxexp, (mpfr_eexp_t) minexp));
+
+      MPFR_ASSERTD (maxexp > minexp);
+
+      for (i = 0; i < n; i++)
+        if (! MPFR_IS_SINGULAR (x[i]))
+          {
+            mp_limb_t *dp, *vp;
+            mp_size_t ds, vs, vds;
+            mpfr_exp_t xe, vd;
+            mpfr_prec_t xq;
+            int tr;
+
+            xe = MPFR_GET_EXP (x[i]);
+            xq = MPFR_GET_PREC (x[i]);
+
+            vp = MPFR_MANT (x[i]);
+            vs = MPFR_PREC2LIMBS (xq);
+            vd = xe - vs * GMP_NUMB_BITS - minexp;
+            /* vd is the exponent of the least significant represented bit of
+               x[i] (including the trailing bits, whose value is 0) minus the
+               exponent of the least significant bit of the accumulator. To
+               make the code simpler, we won't try to filter out the trailing
+               bits of x[i]. */
+
+            if (vd < 0)
+              {
+                /* This covers the following cases:
+                 *     [-+- accumulator ---]
+                 *   [---|----- x[i] ------|--]
+                 *       |   [----- x[i] --|--]
+                 *       |                 |[----- x[i] -----]
+                 *       |                 |    [----- x[i] -----]
+                 *     maxexp           minexp
+                 */
+
+                if (xe <= minexp)
+                  {
+                    /* x[i] is entirely after the LSB of the accumulator,
+                       so that it will be ignored at this iteration. */
+                    if (xe > maxexp2)
+                      maxexp2 = xe;
+                    continue;
+                  }
+
+                /* If some significant bits of x[i] are after the LSB of the
+                   accumulator, then maxexp2 will necessarily be minexp. */
+                if (MPFR_LIKELY (xe - xq < minexp))
+                  maxexp2 = minexp;
+
+                /* We need to ignore the least |vd| significant bits of x[i].
+                   First, let's ignore the least vds = |vd| / GMP_NUMB_BITS
+                   limbs. */
+                vd = - vd;
+                vds = vd / GMP_NUMB_BITS;
+                vs -= vds;
+                MPFR_ASSERTD (vs > 0);  /* see xe <= minexp test above */
+                vp += vds;
+                vd -= vds * GMP_NUMB_BITS;
+                MPFR_ASSERTD (vd >= 0 && vd < GMP_NUMB_BITS);
+
+                if (xe > maxexp)
+                  {
+                    vs -= (xe - maxexp) / GMP_NUMB_BITS;
+                    MPFR_ASSERTD (vs > 0);
+                    tr = (xe - maxexp) % GMP_NUMB_BITS;
+                  }
+                else
+                  tr = 0;
+
+                if (vd != 0)
+                  {
+                    MPFR_ASSERTD (vs <= ts);
+                    mpn_rshift (tp, vp, vs, vd);
+                    vp = tp;
+                    tr += vd;
+                    if (tr >= GMP_NUMB_BITS)
+                      {
+                        vs--;
+                        tr -= GMP_NUMB_BITS;
+                      }
+                    MPFR_ASSERTD (tr >= 0 && tr < GMP_NUMB_BITS);
+                    if (tr != 0)
+                      {
+                        tp[vs-1] &= MPFR_LIMB_MASK (GMP_NUMB_BITS - tr);
+                        tr = 0;
+                      }
+                    /* Truncation has now been taken into account. */
+                    MPFR_ASSERTD (tr == 0);
+                  }
+
+                dp = wp;
+                ds = ws;
+              }
+            else  /* vd >= 0 */
+              {
+                /* This covers the following cases:
+                 *               [-+- accumulator ---]
+                 *   [- x[i] -]    |
+                 *             [---|-- x[i] ------]  |
+                 *          [------|-- x[i] ---------]
+                 *                 |   [- x[i] -]    |
+                 *               maxexp           minexp
+                 */
+
+                /* We need to ignore the least vd significant bits
+                   of the accumulator. First, let's ignore the least
+                   vds = vd / GMP_NUMB_BITS limbs. -> (dp,ds) */
+                vds = vd / GMP_NUMB_BITS;
+                ds = ws - vds;
+                if (ds <= 0)
+                  continue;
+                dp = wp + vds;
+                vd -= vds * GMP_NUMB_BITS;
+                MPFR_ASSERTD (vd >= 0 && vd < GMP_NUMB_BITS);
+
+                /* The low part of x[i] (to be determined) will have to be
+                   shifted vd bits to the left if vd != 0. */
+
+                if (xe > maxexp)
+                  {
+                    vs -= (xe - maxexp) / GMP_NUMB_BITS;
+                    if (vs <= 0)
+                      continue;
+                    tr = (xe - maxexp) % GMP_NUMB_BITS;
+                  }
+                else
+                  tr = 0;
+
+                MPFR_ASSERTD (tr >= 0 && tr < GMP_NUMB_BITS && vs > 0);
+
+                /* We need to consider the least significant vs limbs of x[i]
+                   except the most significant tr bits. */
+
+                if (vd != 0)
+                  {
+                    mp_limb_t carry;
+
+                    MPFR_ASSERTD (vs <= ts);
+                    carry = mpn_lshift (tp, vp, vs, vd);
+                    tr -= vd;
+                    if (tr < 0)
+                      {
+                        tr += GMP_NUMB_BITS;
+                        MPFR_ASSERTD (vs + 1 <= ts);
+                        tp[vs++] = carry;
+                      }
+                    MPFR_ASSERTD (tr >= 0 && tr < GMP_NUMB_BITS);
+                    vp = tp;
+                  }
+              }
+
+            MPFR_ASSERTD (vs > 0 && vs <= ds);
+
+            /* We can't truncate the most significant limb of the input
+               (in case it hasn't been shifted to the temporary area).
+               So, let's ignore it now. It will be taken into account
+               via carry propagation after the addition. */
+            if (tr != 0)
+              vs--;
+
+            if (MPFR_IS_POS (x[i]))
+              {
+                mp_limb_t carry;
+
+                carry = vs > 0 ? mpn_add_n (dp, dp, vp, vs) : 0;
+                MPFR_ASSERTD (carry <= 1);
+                if (tr != 0)
+                  carry += vp[vs] & MPFR_LIMB_MASK (GMP_NUMB_BITS - tr);
+                mpn_add_1 (dp + vs, dp + vs, ds - vs, carry);
+              }
+            else
+              {
+                mp_limb_t borrow;
+
+                borrow = vs > 0 ? mpn_sub_n (dp, dp, vp, vs) : 0;
+                MPFR_ASSERTD (borrow <= 1);
+                if (tr != 0)
+                  borrow += vp[vs] & MPFR_LIMB_MASK (GMP_NUMB_BITS - tr);
+                mpn_sub_1 (dp + vs, dp + vs, ds - vs, borrow);
+              }
+          }
+
+      {
+        mpfr_prec_t cancel;  /* number of cancelled bits */
+        mp_size_t wi;        /* index in the accumulator */
+        mp_limb_t a, b;
+        int cnt;
+
+        cancel = 0;
+        wi = ws - 1;
+        MPFR_ASSERTD (wi >= 0);
+        a = wp[wi] >> (GMP_NUMB_BITS - 1) ? MPFR_LIMB_MAX : MPFR_LIMB_ZERO;
+
+        while (wi >= 0)
+          if ((b = wp[wi]) == a)
             {
-              if (sign_inf == 0) /* No previous INF */
-                sign_inf = MPFR_SIGN (tab[i]);
-              else if (sign_inf != MPFR_SIGN (tab[i]))
-                return 2; /* Return NAN */
+              cancel += GMP_NUMB_BITS;
+              wi--;
+            }
+          else
+            {
+              b ^= a;
+              MPFR_ASSERTD (b != 0);
+              count_leading_zeros (cnt, b);
+              cancel += cnt;
+              break;
             }
-        }
-      else
-        {
-          MPFR_ASSERTD (MPFR_IS_PURE_FP (tab[i]));
-          if (MPFR_GET_EXP (tab[i]) < min)
-            min = MPFR_GET_EXP(tab[i]);
-          if (MPFR_GET_EXP (tab[i]) > max)
-            max = MPFR_GET_EXP(tab[i]);
-        }
-      if (MPFR_PREC (tab[i]) > *maxprec)
-        *maxprec = MPFR_PREC (tab[i]);
-    }
-  if (MPFR_UNLIKELY (sign_inf != 0))
-    return sign_inf;
 
-  exp_num = max - min + 1;
-  /* FIXME : better test */
-  if (exp_num > n * MPFR_INT_CEIL_LOG2 (n))
-    heap_sort (tab, n, perm);
-  else
-    count_sort (tab, n, perm, min, exp_num);
-  return 0;
+        if (wi >= 0 || a != MPFR_LIMB_ZERO)  /* accumulator != 0 */
+          {
+            mpfr_exp_t e;        /* exponent of the computed result */
+            mpfr_exp_t err;      /* exponent of the error bound */
+
+            MPFR_LOG_MSG (("accumulator %s 0, cancel=%Pd\n",
+                           a != MPFR_LIMB_ZERO ? "<" : ">", cancel));
+
+            MPFR_ASSERTD (cancel > 0);
+            e = minexp + wq - cancel;
+            MPFR_ASSERTD (e >= minexp);
+            err = maxexp2 + logn;  /* OK even if maxexp2 == MPFR_EXP_MIN */
+
+            /* The absolute value of the truncated sum is in the binade
+               [2^(e-1),2^e] (closed on both ends due to two's complement).
+               The error is strictly less than 2^err (and is 0 if
+               maxexp2 == MPFR_EXP_MIN). */
+
+            /* This basically tests whether err <= e - prec without
+               potential integer overflow (since prec >= 0)... */
+            if (err <= e && SAFE_DIFF (mpfr_uexp_t, e, err) >= prec)
+              {
+                MPFR_LOG_MSG (("(err=%" MPFR_EXP_FSPEC "d) <= (e=%"
+                               MPFR_EXP_FSPEC "d) - (prec=%Pd)\n",
+                               (mpfr_eexp_t) err, (mpfr_eexp_t) e, prec));
+                if (ep != NULL)
+                  *ep = e;
+                *minexpp = minexp;
+                *maxexpp = maxexp2;
+                MPFR_LOG_MSG (("return with minexp=%" MPFR_EXP_FSPEC
+                               "d maxexp2=%" MPFR_EXP_FSPEC "d%s\n",
+                               (mpfr_eexp_t) minexp, (mpfr_eexp_t) maxexp2,
+                               maxexp2 == MPFR_EXP_MIN ?
+                               " (MPFR_EXP_MIN)" : ""));
+                return cancel;
+              }
+            else
+              {
+                mpfr_exp_t diffexp;
+                mpfr_prec_t shiftq;
+                mpfr_size_t shifts;
+                int shiftc;
+
+                MPFR_LOG_MSG (("e=%" MPFR_EXP_FSPEC "d err=%" MPFR_EXP_FSPEC
+                               "d maxexp2=%" MPFR_EXP_FSPEC "d%s\n",
+                               (mpfr_eexp_t) e, (mpfr_eexp_t) err,
+                               (mpfr_eexp_t) maxexp2,
+                               maxexp2 == MPFR_EXP_MIN ?
+                               " (MPFR_EXP_MIN)" : ""));
+
+                diffexp = err - e;
+                if (diffexp < 0)
+                  diffexp = 0;
+                /* diffexp = max(0, err - e) */
+
+                MPFR_LOG_MSG (("diffexp=%" MPFR_EXP_FSPEC "d\n",
+                                (mpfr_eexp_t) diffexp));
+
+                MPFR_ASSERTD (diffexp < cancel - 2);
+                shiftq = cancel - 2 - (mpfr_prec_t) diffexp;
+                MPFR_ASSERTD (shiftq > 0);
+                shifts = shiftq / GMP_NUMB_BITS;
+                shiftc = shiftq % GMP_NUMB_BITS;
+                MPFR_LOG_MSG (("shiftq = %Pd = %Pd * GMP_NUMB_BITS + %d\n",
+                               shiftq, (mpfr_prec_t) shifts, shiftc));
+                if (MPFR_LIKELY (shiftc != 0))
+                  mpn_lshift (wp + shifts, wp, ws - shifts, shiftc);
+                else
+                  MPN_COPY_DECR (wp + shifts, wp, ws - shifts);
+                MPN_ZERO (wp, shifts);
+                minexp -= shiftq;
+              }
+          }
+        else if (maxexp2 == MPFR_EXP_MIN)
+          {
+            MPFR_LOG_MSG (("accumulator = 0, maxexp2 = MPFR_EXP_MIN\n", 0));
+            return 0;
+          }
+        else
+          {
+            MPFR_LOG_MSG (("accumulator = 0, reiterate\n", 0));
+            UPDATE_MINEXP (maxexp2, wq - (logn + 1));
+            /* Note: the logn + 1 corresponds to cq in the main code. */
+          }
+      }
+
+      maxexp = maxexp2;
+    }
 }
 
-#define GET_EXP1(x) (MPFR_IS_ZERO (x) ? min : MPFR_GET_EXP (x))
-/* Performs a count sort of the entries */
-static void
-count_sort (mpfr_srcptr *const tab, unsigned long n,
-            mpfr_srcptr *perm, mpfr_exp_t min, mpfr_uexp_t exp_num)
+/**********************************************************************/
+
+/* Generic case: all the inputs are finite numbers,
+   with at least 3 regular numbers. */
+static int
+sum_aux (mpfr_ptr sum, mpfr_ptr *const x, unsigned long n, mpfr_rnd_t rnd,
+         mpfr_exp_t maxexp, unsigned long rn)
 {
-  unsigned long *account;
-  unsigned long target_rank, i;
-  MPFR_TMP_DECL(marker);
+  mp_limb_t *sump;
+  mp_limb_t *tp;  /* pointer to a temporary area */
+  mp_limb_t *wp;  /* pointer to the accumulator */
+  mp_size_t ts;   /* size of the temporary area, in limbs */
+  mp_size_t ws;   /* size of the accumulator, in limbs */
+  mpfr_prec_t wq; /* size of the accumulator, in bits */
+  int logn;       /* ceil(log2(rn)) */
+  int cq;
+  mpfr_prec_t sq;
+  int inex;
+  MPFR_TMP_DECL (marker);
+
+  MPFR_LOG_FUNC
+    (("n=%lu rnd=%d maxexp=%" MPFR_EXP_FSPEC "d rn=%lu",
+      n, rnd, (mpfr_eexp_t) maxexp, rn),
+     ("sum[%Pu]=%.*Rg", mpfr_get_prec (sum), mpfr_log_prec, sum));
+
+  MPFR_ASSERTD (rn >= 3 && rn <= n);
+
+  /* In practice, no integer overflow on the exponent. */
+  MPFR_STAT_STATIC_ASSERT (MPFR_EXP_MAX - MPFR_EMAX_MAX >=
+                           sizeof (unsigned long) * CHAR_BIT);
+
+  /* Set up some variables and the accumulator. */
 
-  /* Reserve a place for potential 0 (with EXP min-1)
-     If there is no zero, we only lose one unused entry */
-  min--;
-  exp_num++;
+  sump = MPFR_MANT (sum);
+
+  /* rn is the number of regular inputs (the singular ones will be
+     ignored). Compute logn = ceil(log2(rn)). */
+  logn = MPFR_INT_CEIL_LOG2 (rn);
+  MPFR_ASSERTD (logn >= 2);
+
+  MPFR_LOG_MSG (("logn=%d maxexp=%" MPFR_EXP_FSPEC "d\n",
+                 logn, (mpfr_eexp_t) maxexp));
+
+  sq = MPFR_GET_PREC (sum);
+  cq = logn + 1;
+
+  /* First determine the size of the accumulator. */
+  ws = MPFR_PREC2LIMBS (cq + sq + logn + 2);
+  wq = (mpfr_prec_t) ws * GMP_NUMB_BITS;
+  MPFR_ASSERTD (wq - cq - sq >= 4);
+
+  MPFR_LOG_MSG (("cq=%d sq=%Pd logn=%d wq=%Pd\n", cq, sq, logn, wq));
+
+  /* An input block will have up to wq - cq bits, and its shifted value
+     (to be correctly aligned) may take GMP_NUMB_BITS - 1 additional bits. */
+  ts = MPFR_PREC2LIMBS (wq - cq + GMP_NUMB_BITS - 1);
 
-  /* Performs a counting sort of the entries */
   MPFR_TMP_MARK (marker);
-  account = (unsigned long *) MPFR_TMP_ALLOC (exp_num * sizeof *account);
-  for (i = 0; i < exp_num; i++)
-    account[i] = 0;
-  for (i = 0; i < n; i++)
-    account[GET_EXP1 (tab[i]) - min]++;
-  for (i = exp_num - 1; i >= 1; i--)
-    account[i - 1] += account[i];
-  for (i = 0; i < n; i++)
-    {
-      target_rank = --account[GET_EXP1 (tab[i]) - min];
-      perm[target_rank] = tab[i];
-    }
+
+  tp = MPFR_TMP_LIMBS_ALLOC (ts + ws);
+  wp = tp + ts;
+
+  MPN_ZERO (wp, ws);  /* zero the accumulator */
+
+  {
+    mpfr_exp_t minexp;   /* exponent of the LSB of the block */
+    mpfr_prec_t cancel;  /* number of cancelled bits */
+    mpfr_exp_t e;        /* temporary exponent of the result */
+    mpfr_exp_t u;        /* temporary exponent of the ulp (quantum) */
+    mp_limb_t rbit;      /* rounding bit (corrected in halfway case) */
+    int corr;            /* correction term (from -1 to 2) */
+    int sd, sh;          /* shift counts */
+    mp_size_t sn;        /* size of the output number */
+    int tmd;             /* 0: the TMD does not occur
+                            1: the TMD occurs on a machine number
+                            2: the TMD occurs on a midpoint */
+    int pos;             /* 0 if negative sum, 1 if positive */
+
+    MPFR_LOG_MSG (("Compute an approximation with sum_raw...\n", 0));
+
+    UPDATE_MINEXP (maxexp, wq - cq);
+    cancel = sum_raw (wp, ws, wq, x, n, minexp, maxexp, tp, ts,
+                      logn, sq + 3, &e, &minexp, &maxexp);
+
+    if (MPFR_UNLIKELY (cancel == 0))
+      {
+        /* The exact sum is zero. Since not all inputs are 0, the sum
+         * is +0 except in MPFR_RNDD, as specified according to the
+         * IEEE 754 rules for the addition of two numbers.
+         */
+        MPFR_SET_SIGN (sum, (rnd != MPFR_RNDD ?
+                             MPFR_SIGN_POS : MPFR_SIGN_NEG));
+        MPFR_SET_ZERO (sum);
+        MPFR_TMP_FREE (marker);
+        MPFR_RET (0);
+      }
+
+    /* The absolute value of the truncated sum is in the binade
+       [2^(e-1),2^e] (closed on both ends due to two's complement).
+       The error is strictly less than 2^(maxexp + logn) (and is 0
+       if maxexp == MPFR_EXP_MIN). */
+
+    u = e - sq;  /* e being the exponent, u is the ulp of the target */
+
+    MPFR_LOG_MSG (("cancel=%Pd"
+                   " e=%" MPFR_EXP_FSPEC "d"
+                   " u=%" MPFR_EXP_FSPEC "d"
+                   " maxexp=%" MPFR_EXP_FSPEC "d%s\n",
+                   cancel, (mpfr_eexp_t) e, (mpfr_eexp_t) u,
+                   (mpfr_eexp_t) maxexp,
+                   maxexp == MPFR_EXP_MIN ? " (MPFR_EXP_MIN)" : ""));
+
+    /* Let's copy/shift the bits [max(u,minexp),e) to the
+       most significant part of the destination, and zero
+       the least significant part (there can be one only if
+       u < minexp). The trailing bits of the destination may
+       contain garbage at this point. Then, at the same time,
+       take the absolute value and do an initial rounding,
+       zeroing the trailing bits at this point.
+       TODO: This may be improved by merging some operations
+       is particular case. The average speed-up may not be
+       significant, though. To be tested... */
+
+    sn = MPFR_PREC2LIMBS (sq);
+    sd = (mpfr_prec_t) sn * GMP_NUMB_BITS - sq;
+    sh = cancel % GMP_NUMB_BITS;
+
+    MPFR_ASSERTD (sd >= 0 && sd < GMP_NUMB_BITS);
+
+    if (MPFR_LIKELY (u > minexp))
+      {
+        mpfr_prec_t tq;
+        mp_size_t ei, fi, wi;
+        int td;
+
+        tq = u - minexp;
+        MPFR_ASSERTD (tq > 0); /* number of trailing bits */
+        MPFR_LOG_MSG (("tq=%Pd\n", tq));
+
+        wi = tq / GMP_NUMB_BITS;
+
+        if (MPFR_LIKELY (sh != 0))
+          {
+            ei = (e - minexp) / GMP_NUMB_BITS;
+            fi = ei - (sn - 1);
+            MPFR_ASSERTD (fi == wi || fi == wi + 1);
+            mpn_lshift (sump, wp + fi, sn, sh);
+            if (fi != wi)
+              sump[0] |= wp[wi] >> (GMP_NUMB_BITS - sh);
+          }
+        else
+          {
+            MPFR_ASSERTD ((mpfr_prec_t) (ws - (wi + sn)) * GMP_NUMB_BITS
+                          == cancel);
+            MPN_COPY (sump, wp + wi, sn);
+          }
+
+        /* Determine the rounding bit, which is represented. */
+        td = tq % GMP_NUMB_BITS;
+        rbit = td >= 1 ? ((wp[wi] >> (td - 1)) & MPFR_LIMB_ONE) :
+          (MPFR_ASSERTD (wi >= 1), wp[wi-1] >> (GMP_NUMB_BITS - 1));
+        MPFR_ASSERTD (rbit == 0 || rbit == 1);
+        MPFR_LOG_MSG (("rbit=%d\n", (int) rbit));
+
+        if (maxexp == MPFR_EXP_MIN)
+          {
+            /* The sum in the accumulator is exact. Determine inex:
+               inex = 0 if the final sum is exact, else 1, i.e.
+               inex = rounding bit || sticky bit. In round to nearest,
+               also determine the rounding direction: obtained from
+               the rounding bit possibly except in halfway cases. */
+            if (MPFR_LIKELY (rbit == 0 ||
+                             (rnd == MPFR_RNDN && ((wp[wi] >> td) & 1) == 0)))
+              {
+                /* We need to determine the sticky bit, either to set inex
+                   (if the rounding bit is 0) or to possibly "correct" rbit
+                   (round to nearest, halfway case rounded downward) from
+                   which the rounding direction will be determined. */
+                MPFR_LOG_MSG (("Determine the sticky bit...\n", 0));
+
+                inex = td >= 2 ? (wp[wi] & MPFR_LIMB_MASK (td - 1)) != 0
+                  : td == 0 ?
+                  (MPFR_ASSERTD (wi >= 1),
+                   (wp[--wi] & MPFR_LIMB_MASK (GMP_NUMB_BITS - 1)) != 0)
+                  : 0;
+
+                if (!inex)
+                  {
+                    mp_size_t wj = wi;
+
+                    while (!inex && wj > 0)
+                      inex = wp[--wj] != 0;
+                    if (!inex && rbit != 0)
+                      {
+                        /* sticky bit = 0, rounding bit = 1,
+                           i.e. halfway case, which will be
+                           rounded downward (see earlier if). */
+                        MPFR_ASSERTD (rnd == MPFR_RNDN);
+                        inex = 1;
+                        rbit = 0;  /* even rounding downward */
+                        MPFR_LOG_MSG (("Halfway case rounded downward;"
+                                       " set inex=1 rbit=0\n", 0));
+                      }
+                  }
+              }
+            else
+              inex = 1;
+            tmd = 0;  /* We can round correctly -> no TMD. */
+          }
+        else  /* maxexp > MPFR_EXP_MIN */
+          {
+            mpfr_exp_t d;
+            mp_limb_t limb, mask;
+            int nbits;
+
+            inex = 1;  /* We do not know whether the sum is exact. */
+
+            MPFR_ASSERTD (u <= MPFR_EMAX_MAX);
+            d = u - (maxexp + logn);  /* representable */
+            MPFR_ASSERTD (d >= 3);
+
+            /* Let's see whether the TMD occurs by looking at the d bits
+               following the ulp bit, or the d-1 bits after the rounding
+               bit. */
+
+            /* First chunk after the rounding bit... It starts at:
+               (wi,td-2) if td >= 2,
+               (wi-1,td-2+GMP_NUMB_BITS) if td < 2. */
+            if (td == 0)
+              {
+                MPFR_ASSERTD (wi >= 1);
+                limb = wp[--wi];
+                mask = MPFR_LIMB_MASK (GMP_NUMB_BITS - 1);
+                nbits = GMP_NUMB_BITS;
+              }
+            else if (td == 1)
+              {
+                limb = wi >= 1 ? wp[--wi] : MPFR_LIMB_ZERO;
+                mask = MPFR_LIMB_MAX;
+                nbits = GMP_NUMB_BITS + 1;
+              }
+            else  /* td >= 2 */
+              {
+                MPFR_ASSERTD (td >= 2);
+                limb = wp[wi];
+                mask = MPFR_LIMB_MASK (td - 1);
+                nbits = td;
+              }
+
+            /* nbits: number of bits of the first chunk + 1
+               (the +1 is for the rounding bit). */
+
+            if (nbits > d)
+              {
+                /* Some low significant bits must be ignored. */
+                limb >>= nbits - d;
+                mask >>= nbits - d;
+                d = 0;
+              }
+            else
+              {
+                d -= nbits;
+                MPFR_ASSERTD (d >= 0);
+              }
+
+            limb &= mask;
+            tmd =
+              limb == MPFR_LIMB_ZERO ?
+                (rbit == 0 ? 1 : rnd == MPFR_RNDN ? 2 : 0) :
+              limb == mask ?
+                (limb = MPFR_LIMB_MAX,
+                 rbit != 0 ? 1 : rnd == MPFR_RNDN ? 2 : 0) : 0;
+
+            while (tmd != 0 && d != 0)
+              {
+                mp_limb_t limb2;
+
+                MPFR_ASSERTD (d > 0);
+                if (wi == 0)
+                  {
+                    /* The non-represented bits are 0's. */
+                    if (limb != MPFR_LIMB_ZERO)
+                      tmd = 0;
+                    break;
+                  }
+                MPFR_ASSERTD (wi > 0);
+                limb2 = wp[--wi];
+                if (d < GMP_NUMB_BITS)
+                  {
+                    int c = GMP_NUMB_BITS - d;
+                    MPFR_ASSERTD (c > 0 && c < GMP_NUMB_BITS);
+                    if ((limb2 >> c) != (limb >> c))
+                      tmd = 0;
+                    break;
+                  }
+                if (limb2 != limb)
+                  tmd = 0;
+                d -= GMP_NUMB_BITS;
+              }
+          }
+      }
+    else  /* u <= minexp */
+      {
+        mp_size_t en;
+
+        en = (e - minexp + (GMP_NUMB_BITS - 1)) / GMP_NUMB_BITS;
+        if (MPFR_LIKELY (sh != 0))
+          mpn_lshift (sump + sn - en, wp, en, sh);
+        else if (MPFR_UNLIKELY (en > 0))
+          MPN_COPY (sump + sn - en, wp, en);
+        if (sn > en)
+          MPN_ZERO (sump, sn - en);
+
+        /* The exact value of the accumulator has been copied.
+         * The TMD occurs if and only if there are bits still
+         * not taken into account, and if it occurs, this is
+         * necessarily on a machine number (-> tmd = 1).
+         */
+        rbit = 0;
+        inex = tmd = maxexp != MPFR_EXP_MIN;
+      }
+
+    /* Leading bit: 1 if positive, 0 if negative. */
+    pos = sump[sn-1] >> (GMP_NUMB_BITS - 1);
+
+    MPFR_ASSERTD (rbit == 0 || rbit == 1);
+
+    MPFR_LOG_MSG (("tmd=%d rbit=%d inex=%d pos=%d\n",
+                   tmd, (int) rbit, inex, pos));
+
+    /* Here, if the final sum is known to be exact, inex = 0, otherwise
+     * inex = 1. We have a truncated significand, a trailing term t such
+     * that 0 <= t < 1 ulp, and an error on the trailing term bounded by
+     * t' in absolute value. Thus the error e on the truncated significand
+     * satisfies -t' <= e < 1 ulp + t'. Thus one has 4 correction cases
+     * denoted by a corr value between -1 and 2 depending on e, pos, rbit,
+     * and the rounding mode:
+     *   -1: equivalent to nextbelow;
+     *    0: the truncated significand is not corrected;
+     *    1: add 1 ulp;
+     *    2: add 1 ulp, then nextabove.
+     * The nextbelow and nextabove are used here since there may be a
+     * change of the binade.
+     */
+
+    if (tmd == 0)  /* no TMD */
+      {
+        switch (rnd)
+          {
+          case MPFR_RNDD:
+            corr = 0;
+            break;
+          case MPFR_RNDU:
+            corr = inex;
+            break;
+          case MPFR_RNDZ:
+            corr = inex && !pos;
+            break;
+          case MPFR_RNDA:
+            corr = inex && pos;
+            break;
+          default:
+            MPFR_ASSERTN (rnd == MPFR_RNDN);
+            /* Note: for halfway cases (maxexp == MPFR_EXP_MIN) that are
+               rounded downward, rbit has been changed to 0 so that corr
+               is set correctly. */
+            corr = rbit;
+          }
+        MPFR_ASSERTD (corr == 0 || corr == 1);
+        if (inex && corr == 0)  /* two's complement significand decreased */
+          inex = -1;
+      }
+    else
+      {
+        mpfr_exp_t err;  /* exponent of the error bound */
+        mp_size_t zs;
+        int sst;  /* sign of the secondary term */
+
+        MPFR_ASSERTD (maxexp > MPFR_EXP_MIN);
+        MPFR_ASSERTD (tmd == 1 || tmd == 2);
+
+        /* New accumulator size */
+        ws = MPFR_PREC2LIMBS (wq - sq);
+        wq = (mpfr_prec_t) ws * GMP_NUMB_BITS;
+
+        err = maxexp + logn;
+
+        MPFR_LOG_MSG (("TMD with"
+                       " maxexp=%" MPFR_EXP_FSPEC "d"
+                       " err=%" MPFR_EXP_FSPEC "d"
+                       " ws=%Pd"
+                       " wq=%Pd\n",
+                       (mpfr_eexp_t) maxexp, (mpfr_eexp_t) err,
+                       (mpfr_prec_t) ws, wq));
+
+        /* The d-1 bits from u-2 to u-d (= err) are identical. */
+
+        if (err >= minexp)
+          {
+            mpfr_prec_t tq;
+            mp_size_t wi;
+            int td;
+
+            /* Let's keep the last 2 over the d-1 identical bits and the
+               following bits, i.e. the bits from err+1 to minexp. */
+            tq = err - minexp + 2;  /* tq = number of such bits */
+            MPFR_LOG_MSG (("[TMD] tq=%Pd\n", tq));
+            MPFR_ASSERTD (tq >= 2);
+
+            wi = tq / GMP_NUMB_BITS;
+            td = tq % GMP_NUMB_BITS;
+
+            if (td != 0)
+              {
+                wi++;  /* number of words with represented bits */
+                td = GMP_NUMB_BITS - td;
+                zs = ws - wi;
+                MPFR_ASSERTD (zs >= 0 && zs < ws);
+                mpn_lshift (wp + zs, wp, wi, td);
+              }
+            else
+              {
+                MPFR_ASSERTD (wi > 0);
+                zs = ws - wi;
+                MPFR_ASSERTD (zs >= 0 && zs < ws);
+                if (zs > 0)
+                  MPN_COPY_INCR (wp + zs, wp, wi);
+              }
+
+            UPDATE_MINEXP (minexp, zs * GMP_NUMB_BITS + td);
+            MPFR_ASSERTD (minexp == err + 2 - wq);
+          }
+        else  /* err < minexp */
+          {
+            /* At least one of the identical bits is not represented,
+               meaning that it is 0 and all these bits are 0's. Thus
+               the accumulator will be 0. The new minexp is determined
+               from maxexp, with cq bits reserved to avoid an overflow
+               (as in the early steps). */
+            MPFR_LOG_MSG (("[TMD] err < minexp\n", 0));
+            zs = ws;
+
+            /* minexp = maxexp + cq - wq */
+            UPDATE_MINEXP (maxexp, wq - cq);
+            MPFR_ASSERTD (minexp == err + 1 - wq);
+          }
+
+        MPN_ZERO (wp, zs);
+
+        cancel = sum_raw (wp, ws, wq, x, n, minexp, maxexp, tp, ts,
+                          logn, 0, NULL, &minexp, &maxexp);
+
+        if (cancel != 0)
+          sst = MPFR_LIMB_MSB (wp[ws-1]) == 0 ? 1 : -1;
+        else if (tmd == 1)
+          sst = 0;
+        else
+          {
+            /* For halfway cases, let's virtually eliminate them
+               by setting a sst equivalent to a non-halfway case,
+               which depends on the last bit of the pre-rounded
+               result. */
+            MPFR_ASSERTD (rnd == MPFR_RNDN && tmd == 2);
+            sst = (sump[0] & (MPFR_LIMB_ONE << sd)) ? 1 : -1;
+          }
+
+        MPFR_LOG_MSG (("[TMD] tmd=%d rbit=%d sst=%d\n",
+                       tmd, (int) rbit, sst));
+
+        /* Do not consider the corrected sst for MPFR_COV_SET */
+        MPFR_COV_SET (sum_tmd[(int) rnd][tmd-1][rbit]
+                      [cancel == 0 ? 1 : sst+1][pos]);
+
+        inex =
+          MPFR_IS_LIKE_RNDD (rnd, pos ? 1 : -1) ? (sst ? -1 : 0) :
+          MPFR_IS_LIKE_RNDU (rnd, pos ? 1 : -1) ? (sst ?  1 : 0) :
+          (MPFR_ASSERTD (rnd == MPFR_RNDN),
+           tmd == 1 ? - sst : sst);
+
+        if (tmd == 2 && sst == (rbit ? -1 : 1))
+          corr = 1 - (int) rbit;
+        else if (MPFR_IS_LIKE_RNDD (rnd, pos ? 1 : -1) && sst == -1)
+          corr = (int) rbit - 1;
+        else if (MPFR_IS_LIKE_RNDU (rnd, pos ? 1 : -1) && sst == +1)
+          corr = (int) rbit + 1;
+        else
+          corr = (int) rbit;
+      }
+
+    MPFR_LOG_MSG (("pos=%d corr=%d inex=%d\n", pos, corr, inex));
+
+    /* Sign handling (-> absolute value and sign), together with
+       rounding. The most common cases are corr = 0 and corr = 1
+       as this is necessarily the case when the TMD did not occur. */
+
+    MPFR_ASSERTD (corr >= -1 && corr <= 2);
+
+    if (pos)  /* positive result */
+      {
+        MPFR_ASSERTD (MPFR_LIMB_MSB (sump[sn-1]) != 0);
+        MPFR_SET_POS (sum);
+        sump[0] &= ~ MPFR_LIMB_MASK (sd);
+
+        if (corr > 0)
+          {
+            mp_limb_t corr2, carry_out;
+
+            corr2 = (mp_limb_t) corr << sd;
+            /* If corr == 2 && sd == GMP_NUMB_BITS - 1, this overflows
+               to corr2 = 0. This case is taken into account below. */
+
+            carry_out = corr2 != 0 ? mpn_add_1 (sump, sump, sn, corr2) :
+              (MPFR_ASSERTD (sn > 1),
+               mpn_add_1 (sump + 1, sump + 1, sn - 1, MPFR_LIMB_ONE));
+
+            MPFR_ASSERTD (sump[sn-1] >> (GMP_NUMB_BITS - 1) == !carry_out);
+
+            if (MPFR_UNLIKELY (carry_out))
+              {
+                /* Note: The | is important when sump[sn-1] is not 0
+                   (this can occur with sn = 1 and corr = 2). TODO:
+                   Add something to make sure that this is tested. */
+                sump[sn-1] |= MPFR_LIMB_HIGHBIT;
+                e++;
+              }
+          }
+
+        if (corr < 0)
+          {
+            mpn_sub_1 (sump, sump, sn, MPFR_LIMB_ONE << sd);
+
+            if (MPFR_UNLIKELY (MPFR_LIMB_MSB (sump[sn-1]) == 0))
+              {
+                sump[sn-1] |= MPFR_LIMB_HIGHBIT;
+                e--;
+              }
+          }
+      }
+    else  /* negative result */
+      {
+        MPFR_ASSERTD (MPFR_LIMB_MSB (sump[sn-1]) == 0);
+        MPFR_SET_NEG (sum);
+
+        /* abs(x + corr) = - (x + corr) = com(x) + (1 - corr) */
+
+        /* We want to avoid separate mpn_com (or mpn_neg) and mpn_add_1
+           (or mpn_sub_1) operations, as they could yield two loops in
+           some particular cases involving a long sequence of 0's in
+           the low significant bits (except the least significant bit,
+           which doesn't matter). */
+
+        if (corr <= 1)
+          {
+            mp_limb_t corr2;
+
+            /* Here we can just do the correction operation on the
+               least significant limb, then do either a mpn_com or
+               a mpn_neg on the remaining limbs, depending on the
+               carry (BTW, mpn_neg is just a mpn_com with an initial
+               carry propagation: after some point, mpn_neg does a
+               complement). */
+
+            corr2 = (mp_limb_t) (1 - corr) << sd;
+            /* Note: If corr = -1, this can overflow to corr2 = 0.
+               This case is taken into account below. */
+
+            sump[0] = (~ (sump[0] | MPFR_LIMB_MASK (sd))) + corr2;
+
+            if (sump[0] < corr2 || (corr2 == 0 && corr < 0))
+              {
+                if (sn == 1 || ! mpn_neg (sump + 1, sump + 1, sn - 1))
+                  {
+                    /* Note: The | is important when sump[sn-1] is not 0
+                       (this can occur with sn = 1 and corr = -1). TODO:
+                       Add something to make sure that this is tested. */
+                    sump[sn-1] |= MPFR_LIMB_HIGHBIT;
+                    e++;
+                  }
+              }
+            else if (sn > 1)
+              mpn_com (sump + 1, sump + 1, sn - 1);
+          }
+        else  /* corr == 2 */
+          {
+            mp_limb_t corr2, c;
+            mp_size_t i = 1;
+
+            /* We want to compute com(x) - 1, but GMP doesn't have an
+               operation for that. The fact is that a sequence of low
+               significant bits 1 is invariant. Starting at the first
+               low significant bit 0, we can do the complement with
+               mpn_com. */
+
+            corr2 = MPFR_LIMB_ONE << sd;
+            c = ~ (sump[0] | MPFR_LIMB_MASK (sd));
+            sump[0] = c - corr2;
+
+            if (c == 0)
+              {
+                while (MPFR_ASSERTD (i < sn), sump[i] == MPFR_LIMB_MAX)
+                  i++;
+                sump[i] = (~ sump[i]) - 1;
+                i++;
+              }
+
+            if (i < sn)
+              mpn_com (sump + i, sump + i, sn - i);
+            else if (MPFR_UNLIKELY (MPFR_LIMB_MSB (sump[sn-1]) == 0))
+              {
+                /* Happens on 01111...111, whose complement is
+                   10000...000, and com(x) - 1 is 01111...111. */
+                sump[sn-1] |= MPFR_LIMB_HIGHBIT;
+                e--;
+              }
+          }
+      }
+
+    MPFR_ASSERTD (MPFR_LIMB_MSB (sump[sn-1]) != 0);
+    MPFR_LOG_MSG (("Set exponent e=%" MPFR_EXP_FSPEC "d\n", (mpfr_eexp_t) e));
+    /* e may be outside the current exponent range, but this will be checked
+       with mpfr_check_range below. */
+    MPFR_EXP (sum) = e;
+  }  /* main block */
+
   MPFR_TMP_FREE (marker);
+  return mpfr_check_range (sum, inex, rnd);
 }
 
+/**********************************************************************/
 
-#define GET_EXP2(x) (MPFR_IS_ZERO (x) ? MPFR_EMIN_MIN : MPFR_GET_EXP (x))
-
-/* Performs a heap sort of the entries */
-static void
-heap_sort (mpfr_srcptr *const tab, unsigned long n, mpfr_srcptr *perm)
+int
+mpfr_sum (mpfr_ptr sum, mpfr_ptr *const x, unsigned long n, mpfr_rnd_t rnd)
 {
-  unsigned long dernier_traite;
-  unsigned long i, pere;
-  mpfr_srcptr tmp;
-  unsigned long fils_gauche, fils_droit, fils_indigne;
-  /* Reminder of a heap structure :
-     node(i) has for left son node(2i +1) and right son node(2i)
-     and father(node(i)) = node((i - 1) / 2)
-  */
-
-  /* initialize the permutation to identity */
-  for (i = 0; i < n; i++)
-    perm[i] = tab[i];
-
-  /* insertion phase */
-  for (dernier_traite = 1; dernier_traite < n; dernier_traite++)
+  MPFR_LOG_FUNC
+    (("n=%lu rnd=%d", n, rnd),
+     ("sum[%Pu]=%.*Rg", mpfr_get_prec (sum), mpfr_log_prec, sum));
+
+  if (MPFR_UNLIKELY (n <= 2))
     {
-      i = dernier_traite;
-      while (i > 0)
+      if (n == 0)
         {
-          pere = (i - 1) / 2;
-          if (GET_EXP2 (perm[pere]) > GET_EXP2 (perm[i]))
-            {
-              tmp = perm[pere];
-              perm[pere] = perm[i];
-              perm[i] = tmp;
-              i = pere;
-            }
-          else
-            break;
+          MPFR_SET_ZERO (sum);
+          MPFR_SET_POS (sum);
+          MPFR_RET (0);
         }
+      else if (n == 1)
+        return mpfr_set (sum, x[0], rnd);
+      else
+        return mpfr_add (sum, x[0], x[1], rnd);
     }
-
-  /* extraction phase */
-  for (dernier_traite = n - 1; dernier_traite > 0; dernier_traite--)
+  else
     {
-      tmp = perm[0];
-      perm[0] = perm[dernier_traite];
-      perm[dernier_traite] = tmp;
+      mpfr_exp_t maxexp = MPFR_EXP_MIN;  /* max(Empty) */
+      unsigned long i;
+      unsigned long rn = 0;  /* will be the number of regular inputs */
+      /* sign of infinities and zeros (0: currently unknown) */
+      int sign_inf = 0, sign_zero = 0;
+
+      MPFR_LOG_MSG (("Check for special inputs (n = %lu >= 3)\n", n));
 
-      i = 0;
-      while (1)
+      for (i = 0; i < n; i++)
         {
-          fils_gauche = 2 * i + 1;
-          fils_droit = fils_gauche + 1;
-          if (fils_gauche < dernier_traite)
+          if (MPFR_UNLIKELY (MPFR_IS_SINGULAR (x[i])))
             {
-              if (fils_droit < dernier_traite)
+              if (MPFR_IS_NAN (x[i]))
                 {
-                  if (GET_EXP2(perm[fils_droit]) < GET_EXP2(perm[fils_gauche]))
-                    fils_indigne = fils_droit;
-                  else
-                    fils_indigne = fils_gauche;
-
-                  if (GET_EXP2 (perm[i]) > GET_EXP2 (perm[fils_indigne]))
-                    {
-                      tmp = perm[i];
-                      perm[i] = perm[fils_indigne];
-                      perm[fils_indigne] = tmp;
-                      i = fils_indigne;
-                    }
-                  else
-                    break;
+                  /* The current value x[i] is NaN. Then the sum is NaN. */
+                nan:
+                  MPFR_SET_NAN (sum);
+                  MPFR_RET_NAN;
                 }
-              else /* on a un fils gauche, pas de fils droit */
+              else if (MPFR_IS_INF (x[i]))
                 {
-                  if (GET_EXP2 (perm[i]) > GET_EXP2 (perm[fils_gauche]))
-                    {
-                      tmp = perm[i];
-                      perm[i] = perm[fils_gauche];
-                      perm[fils_gauche] = tmp;
-                    }
-                  break;
+                  /* The current value x[i] is an infinity.
+                     There are two cases:
+                     1. This is the first infinity value (sign_inf == 0).
+                        Then set sign_inf to its sign, and go on.
+                     2. All the infinities found until now have the same
+                        sign sign_inf. If this new infinity has a different
+                        sign, then return NaN immediately, else go on. */
+                  if (sign_inf == 0)
+                    sign_inf = MPFR_SIGN (x[i]);
+                  else if (MPFR_SIGN (x[i]) != sign_inf)
+                    goto nan;
+                }
+              else if (MPFR_UNLIKELY (rn == 0))
+                {
+                  /* The current value x[i] is a zero. The code below matters
+                     only when all values found until now are zeros, otherwise
+                     it is harmless (the test rn == 0 above is just a minor
+                     optimization).
+                     Here we track the sign of the zero result when all inputs
+                     are zeros: if all zeros have the same sign, the result
+                     will have this sign, otherwise (i.e. if there is at least
+                     a zero of each sign), the sign of the zero result depends
+                     only on the rounding mode (note that this choice is
+                     sticky when new zeros are considered). */
+                  MPFR_ASSERTD (MPFR_IS_ZERO (x[i]));
+                  if (sign_zero == 0)
+                    sign_zero = MPFR_SIGN (x[i]);
+                  else if (MPFR_SIGN (x[i]) != sign_zero)
+                    sign_zero = rnd == MPFR_RNDD ? -1 : 1;
                 }
             }
-          else /* on n'a pas de fils */
-            break;
+          else
+            {
+              /* The current value x[i] is a regular number. */
+              mpfr_exp_t e = MPFR_GET_EXP (x[i]);
+              if (e > maxexp)
+                maxexp = e;  /* maximum exponent found until now */
+              rn++;  /* current number of regular inputs */
+            }
         }
-    }
-}
-
 
-/* Sum a list of float with order given by permutation perm,
- * intermediate size set to F. Return non-zero if at least one of
- * the operations is inexact (thus 0 implies that the sum is exact).
- * Internal use function.
- */
-static int
-sum_once (mpfr_ptr ret, mpfr_srcptr *const tab, unsigned long n, mpfr_prec_t F)
-{
-  mpfr_t sum;
-  unsigned long i;
-  int error_trap;
+      MPFR_LOG_MSG (("rn=%lu sign_inf=%d sign_zero=%d\n",
+                     rn, sign_inf, sign_zero));
 
-  MPFR_ASSERTD (n >= 2);
+      /* At this point the result cannot be NaN (this case has already
+         been filtered out). */
 
-  mpfr_init2 (sum, F);
-  error_trap = mpfr_set (sum, tab[0], MPFR_RNDN);
-  for (i = 1; i < n - 1; i++)
-    {
-      MPFR_ASSERTD (!MPFR_IS_NAN (sum) && !MPFR_IS_INF (sum));
-      if (mpfr_add (sum, sum, tab[i], MPFR_RNDN))
-        error_trap = 1;
-    }
-  if (mpfr_add (ret, sum, tab[n - 1], MPFR_RNDN))
-    error_trap = 1;
-  mpfr_clear (sum);
-  return error_trap;
-}
+      if (MPFR_UNLIKELY (sign_inf != 0))
+        {
+          /* At least one infinity, and all of them have the same sign
+             sign_inf. The sum is the infinity of this sign. */
+          MPFR_SET_INF (sum);
+          MPFR_SET_SIGN (sum, sign_inf);
+          MPFR_RET (0);
+        }
 
-/* Sum a list of floating-point numbers.
- * If the return value is 0, then the sum is exact.
- * Otherwise the return value gives no information.
- */
-int
-mpfr_sum (mpfr_ptr ret, mpfr_ptr *const tab_p, unsigned long n, mpfr_rnd_t rnd)
-{
-  mpfr_t cur_sum;
-  mpfr_prec_t prec;
-  mpfr_srcptr *perm, *const tab = (mpfr_srcptr *) tab_p;
-  int k, error_trap;
-  MPFR_ZIV_DECL (loop);
-  MPFR_SAVE_EXPO_DECL (expo);
-  MPFR_TMP_DECL (marker);
+      /* At this point, all the inputs are finite numbers. */
 
-  if (MPFR_UNLIKELY (n <= 1))
-    {
-      if (n < 1)
+      if (MPFR_UNLIKELY (rn == 0))
         {
-          MPFR_SET_ZERO (ret);
-          MPFR_SET_POS (ret);
-          return 0;
+          /* All the numbers were zeros (and there is at least one).
+             The sum is zero with sign sign_zero. */
+          MPFR_ASSERTD (sign_zero != 0);
+          MPFR_SET_ZERO (sum);
+          MPFR_SET_SIGN (sum, sign_zero);
+          MPFR_RET (0);
         }
-      else
-        return mpfr_set (ret, tab[0], rnd);
-    }
 
-  /* Sort and treat special cases */
-  MPFR_TMP_MARK (marker);
-  perm = (mpfr_srcptr *) MPFR_TMP_ALLOC (n * sizeof *perm);
-  prec = MPFR_PREC (ret);
-  error_trap = mpfr_sum_sort (tab, n, perm, &prec);
-  /* Check if there was a NAN or a INF */
-  if (MPFR_UNLIKELY (error_trap != 0))
-    {
-      MPFR_TMP_FREE (marker);
-      if (error_trap == 2)
+      /* Optimize the case where there are only two regular numbers. */
+      if (MPFR_UNLIKELY (rn <= 2))
         {
-          MPFR_SET_NAN (ret);
-          MPFR_RET_NAN;
-        }
-      MPFR_SET_INF (ret);
-      MPFR_SET_SIGN (ret, error_trap);
-      MPFR_RET (0);
-    }
+          unsigned long h = ULONG_MAX;
 
-  /* Initial precision is max(prec(ret),prec(tab[0]),...,prec(tab[n-1])) */
-  k = MPFR_INT_CEIL_LOG2 (n) + 1;
-  prec += k + 2;
-  mpfr_init2 (cur_sum, prec);
+          for (i = 0; i < n; i++)
+            if (! MPFR_IS_SINGULAR (x[i]))
+              {
+                if (rn == 1)
+                  return mpfr_set (sum, x[i], rnd);
+                if (h != ULONG_MAX)
+                  return mpfr_add (sum, x[h], x[i], rnd);
+                h = i;
+              }
+          MPFR_RET_NEVER_GO_HERE();
+        }
 
-  /* Ziv Loop */
-  MPFR_SAVE_EXPO_MARK (expo);
-  MPFR_ZIV_INIT (loop, prec);
-  for (;;)
-    {
-      error_trap = sum_once (cur_sum, perm, n, prec + k);
-      if (MPFR_LIKELY (error_trap == 0 ||
-                       (!MPFR_IS_ZERO (cur_sum) &&
-                        mpfr_can_round (cur_sum, prec - 2,
-                                        MPFR_RNDN, rnd, MPFR_PREC (ret)))))
-        break;
-      MPFR_ZIV_NEXT (loop, prec);
-      mpfr_set_prec (cur_sum, prec);
+      return sum_aux (sum, x, n, rnd, maxexp, rn);
     }
-  MPFR_ZIV_FREE (loop);
-  MPFR_TMP_FREE (marker);
-
-  if (mpfr_set (ret, cur_sum, rnd))
-    error_trap = 1;
-  mpfr_clear (cur_sum);
-
-  MPFR_SAVE_EXPO_FREE (expo);
-  if (mpfr_check_range (ret, 0, rnd))
-    error_trap = 1;
-  return error_trap; /* It doesn't return the ternary value */
 }
-
-/* __END__ */
diff --git a/tests/tsum.c b/tests/tsum.c
index ead15a0dc..2eee92755 100644
--- a/tests/tsum.c
+++ b/tests/tsum.c
@@ -20,219 +20,119 @@ along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
 http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
 
-/* TODO: add some generic random test with cancellations. Something like:
-   1. Generate random numbers with random precisions.
-   2. Compute the sum s at some random precision and some rounding mode.
-   3. While s != 0:
-   4. Include -s in the array.
-   5. Reorder the terms randomly.
-   6. Recompute a new sum s' at some random precision and some rounding mode.
-   7. Check that |s'| < ulp(s), with a factor 1/2 for MPFR_RNDN.
-   8. Reiterate at (3) with s = s'.
-   Also add tests with intermediate overflows and tests with underflows
-   (this matters here as we don't have subnormals).
-   Note: partly done in the cancel() test (r8857); the result is not tested
-   yet, but this test already shows an efficiency problem.
-*/
-
 #include "mpfr-test.h"
 
 #ifndef MPFR_NCANCEL
 #define MPFR_NCANCEL 10
 #endif
 
-static int
-check_is_sorted (unsigned long n, mpfr_srcptr *perm)
-{
-  unsigned long i;
-
-  for (i = 0; i < n - 1; i++)
-    if (MPFR_GET_EXP(perm[i]) < MPFR_GET_EXP(perm[i+1]))
-      return 0;
-  return 1;
-}
-
-static int
-sum_tab (mpfr_ptr ret, mpfr_t *tab, unsigned long n, mpfr_rnd_t rnd)
+static mpfr_prec_t
+get_prec_max (mpfr_t *t, int n)
 {
-  mpfr_ptr *tabtmp;
-  unsigned long i;
-  int inexact;
-  MPFR_TMP_DECL(marker);
+  mpfr_exp_t e, min, max;
+  int i;
 
-  MPFR_TMP_MARK(marker);
-  tabtmp = (mpfr_ptr *) MPFR_TMP_ALLOC(n * sizeof(mpfr_srcptr));
+  min = MPFR_EMAX_MAX;
+  max = MPFR_EMIN_MIN;
   for (i = 0; i < n; i++)
-    tabtmp[i] = tab[i];
+    if (MPFR_IS_PURE_FP (t[i]))
+      {
+        e = MPFR_GET_EXP (t[i]);
+        if (e > max)
+          max = e;
+        e -= MPFR_GET_PREC (t[i]);
+        if (e < min)
+          min = e;
+      }
 
-  inexact = mpfr_sum (ret, tabtmp, n, rnd);
-  MPFR_TMP_FREE(marker);
-  return inexact;
+  return min > max ? MPFR_PREC_MIN /* no pure FP values */
+    : max - min + __gmpfr_ceil_log2 (n);
 }
 
-
-static mpfr_prec_t
-get_prec_max (mpfr_t *tab, unsigned long n, mpfr_prec_t f)
-{
-  mpfr_prec_t res;
-  mpfr_exp_t min, max;
-  unsigned long i;
-
-  i = 0;
-  while (MPFR_IS_ZERO (tab[i]))
-    {
-      i++;
-      if (i == n)
-        return MPFR_PREC_MIN;  /* all values are 0 */
-    }
-
-  if (! mpfr_check (tab[i]))
-    {
-      printf ("tab[%lu] is not valid.\n", i);
-      exit (1);
-    }
-  MPFR_ASSERTN (MPFR_IS_FP (tab[i]));
-  min = max = MPFR_GET_EXP(tab[i]);
-  for (i++; i < n; i++)
-    {
-      if (! mpfr_check (tab[i]))
-        {
-          printf ("tab[%lu] is not valid.\n", i);
-          exit (1);
-        }
-      MPFR_ASSERTN (MPFR_IS_FP (tab[i]));
-      if (! MPFR_IS_ZERO (tab[i]))
-        {
-          if (MPFR_GET_EXP(tab[i]) > max)
-            max = MPFR_GET_EXP(tab[i]);
-          if (MPFR_GET_EXP(tab[i]) < min)
-            min = MPFR_GET_EXP(tab[i]);
-        }
-    }
-  res = max - min;
-  res += f;
-  res += __gmpfr_ceil_log2 (n) + 1;
-  return res;
-}
-
-
 static void
-algo_exact (mpfr_t somme, mpfr_t *tab, unsigned long n, mpfr_prec_t f)
+get_exact_sum (mpfr_t sum, mpfr_t *tab, int n)
 {
-  unsigned long i;
-  mpfr_prec_t prec_max;
+  int i;
 
-  prec_max = get_prec_max(tab, n, f);
-  mpfr_set_prec (somme, prec_max);
-  mpfr_set_ui (somme, 0, MPFR_RNDN);
+  mpfr_set_prec (sum, get_prec_max (tab, n));
+  mpfr_set_ui (sum, 0, MPFR_RNDN);
   for (i = 0; i < n; i++)
-    {
-      if (mpfr_add(somme, somme, tab[i], MPFR_RNDN))
-        {
-          printf ("FIXME: algo_exact is buggy.\n");
-          exit (1);
-        }
-    }
+    if (mpfr_add (sum, sum, tab[i], MPFR_RNDN))
+      {
+        printf ("FIXME: get_exact_sum is buggy.\n");
+        exit (1);
+      }
 }
 
-/* Test the sorting function */
 static void
-test_sort (mpfr_prec_t f, unsigned long n)
+generic_tests (void)
 {
-  mpfr_t *tab;
-  mpfr_ptr *tabtmp;
-  mpfr_srcptr *perm;
-  unsigned long i;
-  mpfr_prec_t prec = MPFR_PREC_MIN;
-
-  /* Init stuff */
-  tab = (mpfr_t *) (*__gmp_allocate_func) (n * sizeof (mpfr_t));
-  for (i = 0; i < n; i++)
-    mpfr_init2 (tab[i], f);
-  tabtmp = (mpfr_ptr *) (*__gmp_allocate_func) (n * sizeof(mpfr_ptr));
-  perm = (mpfr_srcptr *) (*__gmp_allocate_func) (n * sizeof(mpfr_srcptr));
+  mpfr_t exact_sum, sum1, sum2;
+  mpfr_t *t;
+  mpfr_ptr *p;
+  mpfr_prec_t precmax = 444;
+  int i, m, nmax = 500;
+  int rnd_mode;
 
-  for (i = 0; i < n; i++)
+  t = (mpfr_t *) (*__gmp_allocate_func) (nmax * sizeof(mpfr_t));
+  p = (mpfr_ptr *) (*__gmp_allocate_func) (nmax * sizeof(mpfr_ptr));
+  for (i = 0; i < nmax; i++)
     {
-      mpfr_urandomb (tab[i], RANDS);
-      tabtmp[i] = tab[i];
+      mpfr_init2 (t[i], precmax);
+      p[i] = t[i];
     }
+  mpfr_inits2 (precmax, exact_sum, sum1, sum2, (mpfr_ptr) 0);
 
-  mpfr_sum_sort ((mpfr_srcptr *)tabtmp, n, perm, &prec);
-
-  if (check_is_sorted (n, perm) == 0)
+  for (m = 0; m < 4000; m++)
     {
-      printf ("mpfr_sum_sort incorrect.\n");
-      for (i = 0; i < n; i++)
-        mpfr_dump (perm[i]);
-      exit (1);
-    }
+      int non_uniform, n;
+      mpfr_prec_t prec;
 
-  /* Clear stuff */
-  for (i = 0; i < n; i++)
-    mpfr_clear (tab[i]);
-  (*__gmp_free_func) (tab, n * sizeof (mpfr_t));
-  (*__gmp_free_func) (tabtmp, n * sizeof(mpfr_ptr));
-  (*__gmp_free_func) (perm, n * sizeof(mpfr_srcptr));
-}
-
-static void
-test_sum (mpfr_prec_t f, unsigned long n)
-{
-  mpfr_t sum, real_sum, real_non_rounded;
-  mpfr_t *tab;
-  unsigned long i;
-  int rnd_mode;
-
-  /* Init */
-  tab = (mpfr_t *) (*__gmp_allocate_func) (n * sizeof(mpfr_t));
-  for (i = 0; i < n; i++)
-    mpfr_init2 (tab[i], f);
-  mpfr_inits2 (f, sum, real_sum, real_non_rounded, (mpfr_ptr) 0);
+      non_uniform = randlimb () % 10;
+      n = (randlimb () % nmax) + 1;
+      prec = MPFR_PREC_MIN + (randlimb () % (precmax - MPFR_PREC_MIN + 1));
+      mpfr_set_prec (sum1, prec);
+      mpfr_set_prec (sum2, prec);
 
-  /* First Uniform */
-  for (i = 0; i < n; i++)
-    mpfr_urandomb (tab[i], RANDS);
-  algo_exact (real_non_rounded, tab, n, f);
-  for (rnd_mode = 0; rnd_mode < MPFR_RND_MAX; rnd_mode++)
-    {
-      sum_tab (sum, tab, n, (mpfr_rnd_t) rnd_mode);
-      mpfr_set (real_sum, real_non_rounded, (mpfr_rnd_t) rnd_mode);
-      if (mpfr_cmp (real_sum, sum) != 0)
+      for (i = 0; i < n; i++)
         {
-          printf ("mpfr_sum incorrect.\n");
-          mpfr_dump (real_sum);
-          mpfr_dump (sum);
-          exit (1);
+          mpfr_set_prec (t[i], MPFR_PREC_MIN +
+                         (randlimb () % (precmax - MPFR_PREC_MIN + 1)));
+          mpfr_urandomb (t[i], RANDS);
+          if (m % 8 != 0 && (m % 8 == 1 || (randlimb () & 1)))
+            mpfr_neg (t[i], t[i], MPFR_RNDN);
+          if (non_uniform && MPFR_NOTZERO (t[i]))
+            mpfr_set_exp (t[i], randlimb () % 1000);
+          /* putchar ("-0+"[SIGN (mpfr_sgn (t[i])) + 1]); */
         }
-    }
-
-  /* Then non uniform */
-  for (i = 0; i < n; i++)
-    {
-      mpfr_urandomb (tab[i], RANDS);
-      if (! mpfr_zero_p (tab[i]))
-        mpfr_set_exp (tab[i], randlimb () % 1000);
-    }
-  algo_exact (real_non_rounded, tab, n, f);
-  for (rnd_mode = 0; rnd_mode < MPFR_RND_MAX; rnd_mode++)
-    {
-      sum_tab (sum, tab, n, (mpfr_rnd_t) rnd_mode);
-      mpfr_set (real_sum, real_non_rounded, (mpfr_rnd_t) rnd_mode);
-      if (mpfr_cmp (real_sum, sum) != 0)
+      /* putchar ('\n'); */
+      get_exact_sum (exact_sum, t, n);
+      RND_LOOP (rnd_mode)
         {
-          printf ("mpfr_sum incorrect.\n");
-          mpfr_dump (real_sum);
-          mpfr_dump (sum);
-          exit (1);
+          int inex1, inex2;
+
+          inex1 = mpfr_set (sum1, exact_sum, (mpfr_rnd_t) rnd_mode);
+          inex2 = mpfr_sum (sum2, p, n, (mpfr_rnd_t) rnd_mode);
+          if (!(mpfr_equal_p (sum1, sum2) && SAME_SIGN (inex1, inex2)))
+            {
+              printf ("generic_tests failed on m = %d, %s\n", m,
+                      mpfr_print_rnd_mode ((mpfr_rnd_t) rnd_mode));
+              printf ("Expected ");
+              mpfr_dump (sum1);
+              printf ("with inex = %d\n", inex1);
+              printf ("Got      ");
+              mpfr_dump (sum2);
+              printf ("with inex = %d\n", inex2);
+              exit (1);
+            }
         }
     }
 
-  /* Clear stuff */
-  for (i = 0; i < n; i++)
-    mpfr_clear (tab[i]);
-  mpfr_clears (sum, real_sum, real_non_rounded, (mpfr_ptr) 0);
-  (*__gmp_free_func) (tab, n * sizeof(mpfr_t));
+  for (i = 0; i < nmax; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (exact_sum, sum1, sum2, (mpfr_ptr) 0);
+  (*__gmp_free_func) (t, nmax * sizeof(mpfr_t));
+  (*__gmp_free_func) (p, nmax * sizeof(mpfr_ptr));
 }
 
 static
@@ -318,12 +218,422 @@ void check_special (void)
   mpfr_clears (tab[0], tab[1], tab[2], r, (mpfr_ptr) 0);
 }
 
+#define NC 7
+#define NS 6
+
+static void
+check_more_special (void)
+{
+  char *str[NC] = { "NaN", "+Inf", "-Inf", "+0", "-0", "+1", "-1" };
+  int i, r, k[NS];
+  mpfr_t c[NC], s[NS], sum;
+  mpfr_ptr p[NS];
+  int inex;
+
+  for (i = 0; i < NC; i++)
+    {
+      int ret;
+      mpfr_init2 (c[i], 8);
+      ret = mpfr_set_str (c[i], str[i], 0, MPFR_RNDN);
+      MPFR_ASSERTN (ret == 0);
+    }
+  for (i = 0; i < NS; i++)
+    mpfr_init2 (s[i], 8);
+  mpfr_init2 (sum, 8);
+
+  RND_LOOP(r)
+    {
+      i = 0;
+      while (1)
+        {
+          while (i < NS)
+            {
+              p[i] = c[0];
+              mpfr_set_nan (s[i]);
+              k[i++] = 0;
+            }
+          inex = mpfr_sum (sum, p, NS, (mpfr_rnd_t) r);
+          if (! ((MPFR_IS_NAN (sum) && MPFR_IS_NAN (s[NS-1])) ||
+                 (mpfr_equal_p (sum, s[NS-1]) &&
+                  MPFR_SIGN (sum) == MPFR_SIGN (s[NS-1]))) || inex != 0)
+            {
+              printf ("Error in check_more_special on %s",
+                      mpfr_print_rnd_mode ((mpfr_rnd_t) r));
+              for (i = 0; i < NS; i++)
+                printf (" %d", k[i]);
+              printf (" with\n");
+              for (i = 0; i < NS; i++)
+                {
+                  printf ("  p[%d] = %s = ", i, str[k[i]]);
+                  mpfr_dump (p[i]);
+                }
+              printf ("Expected ");
+              mpfr_dump (s[NS-1]);
+              printf ("with inex = 0\n");
+              printf ("Got      ");
+              mpfr_dump (sum);
+              printf ("with inex = %d\n", inex);
+              exit (1);
+            }
+          while (k[--i] == NC-1)
+            if (i == 0)
+              goto next_rnd;
+          p[i] = c[++k[i]];
+          if (i == 0)
+            mpfr_set (s[i], p[i], (mpfr_rnd_t) r);
+          else
+            mpfr_add (s[i], s[i-1], p[i], (mpfr_rnd_t) r);
+          i++;
+        }
+    next_rnd: ;
+    }
+
+  for (i = 0; i < NC; i++)
+    mpfr_clear (c[i]);
+  for (i = 0; i < NS; i++)
+    mpfr_clear (s[i]);
+  mpfr_clear (sum);
+}
+
+/* i * 2^46 + j * 2^45 + k * 2^44 + f * 2^(-2),
+   with -1 <= i, j, k <= 1, i != 0, -3 <= f <= 3,
+   ulp(exact sum) = 2^0 and ulp(exact sum) = 2^44. */
+static void
+check1 (void)
+{
+  mpfr_t sum1, sum2, s1, s2, s3, t[4];
+  mpfr_ptr p[4];
+  int i, j, k, f, prec, r, inex1, inex2;
+
+  mpfr_init2 (sum1, 47);
+  mpfr_init2 (sum2, 47);
+  mpfr_init2 (s1, 3);
+  mpfr_init2 (s2, 3);
+  mpfr_init2 (s3, 49);
+  for (i = 0; i < 4; i++)
+    {
+      mpfr_init2 (t[i], 2);
+      p[i] = t[i];
+    }
+
+  for (i = -1; i <= 1; i += 2)
+    {
+      mpfr_set_si_2exp (t[0], i, 46, MPFR_RNDN);
+      for (j = -1; j <= 1; j++)
+        {
+          mpfr_set_si_2exp (t[1], j, 45, MPFR_RNDN);
+          inex1 = mpfr_add (s1, t[0], t[1], MPFR_RNDN);
+          MPFR_ASSERTN (inex1 == 0);
+          for (k = -1; k <= 1; k++)
+            {
+              mpfr_set_si_2exp (t[2], k, 44, MPFR_RNDN);
+              inex1 = mpfr_add (s2, s1, t[2], MPFR_RNDN);
+              MPFR_ASSERTN (inex1 == 0);
+              for (f = -3; f <= 3; f++)
+                {
+                  mpfr_set_si_2exp (t[3], f, -2, MPFR_RNDN);
+                  inex1 = mpfr_add (s3, s2, t[3], MPFR_RNDN);
+                  MPFR_ASSERTN (inex1 == 0);
+                  for (prec = mpfr_get_exp (s3);
+                       prec >= MPFR_PREC_MIN;
+                       prec -= 44)
+                    {
+                      mpfr_set_prec (sum1, prec);
+                      mpfr_set_prec (sum2, prec);
+                      RND_LOOP (r)
+                        {
+                          inex1 = mpfr_set (sum1, s3, (mpfr_rnd_t) r);
+                          inex2 = mpfr_sum (sum2, p, 4, (mpfr_rnd_t) r);
+                          MPFR_ASSERTN (mpfr_check (sum1));
+                          MPFR_ASSERTN (mpfr_check (sum2));
+                          if (!(mpfr_equal_p (sum1, sum2) &&
+                                SAME_SIGN (inex1, inex2)))
+                            {
+                              printf ("Error in check1 on %s, prec = %d, "
+                                      "i = %d, j = %d, k = %d, f = %d\n",
+                                      mpfr_print_rnd_mode ((mpfr_rnd_t) r),
+                                      prec, i, j, k, f);
+                              printf ("Expected ");
+                              mpfr_dump (sum1);
+                              printf ("with inex = %d\n", inex1);
+                              printf ("Got      ");
+                              mpfr_dump (sum2);
+                              printf ("with inex = %d\n", inex2);
+                              exit (1);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+  for (i = 0; i < 4; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, s1, s2, s3, (mpfr_ptr) 0);
+}
+
+/* With N = 2 * GMP_NUMB_BITS:
+   i * 2^N + j + k * 2^(-1) + f1 * 2^(-N) + f2 * 2^(-N),
+   with i = -1 or 1, j = 0 or i, -1 <= k <= 1, -1 <= f1 <= 1, -1 <= f2 <= 1
+   ulp(exact sum) = 2^0. */
+static void
+check2 (void)
+{
+  mpfr_t sum1, sum2, s1, s2, s3, s4, t[5];
+  mpfr_ptr p[5];
+  int i, j, k, f1, f2, prec, r, inex1, inex2;
+
+#define N (2 * GMP_NUMB_BITS)
+
+  mpfr_init2 (sum1, N+1);
+  mpfr_init2 (sum2, N+1);
+  mpfr_init2 (s1, N+1);
+  mpfr_init2 (s2, N+2);
+  mpfr_init2 (s3, 2*N+1);
+  mpfr_init2 (s4, 2*N+1);
+  for (i = 0; i < 5; i++)
+    {
+      mpfr_init2 (t[i], 2);
+      p[i] = t[i];
+    }
+
+  for (i = -1; i <= 1; i += 2)
+    {
+      mpfr_set_si_2exp (t[0], i, N, MPFR_RNDN);
+      for (j = 0; j != 2*i; j += i)
+        {
+          mpfr_set_si (t[1], j, MPFR_RNDN);
+          inex1 = mpfr_add (s1, t[0], t[1], MPFR_RNDN);
+          MPFR_ASSERTN (inex1 == 0);
+          for (k = -1; k <= 1; k++)
+            {
+              mpfr_set_si_2exp (t[2], k, -1, MPFR_RNDN);
+              inex1 = mpfr_add (s2, s1, t[2], MPFR_RNDN);
+              MPFR_ASSERTN (inex1 == 0);
+              for (f1 = -1; f1 <= 1; f1++)
+                {
+                  mpfr_set_si_2exp (t[3], f1, -N, MPFR_RNDN);
+                  inex1 = mpfr_add (s3, s2, t[3], MPFR_RNDN);
+                  MPFR_ASSERTN (inex1 == 0);
+                  for (f2 = -1; f2 <= 1; f2++)
+                    {
+                      mpfr_set_si_2exp (t[4], f2, -N, MPFR_RNDN);
+                      inex1 = mpfr_add (s4, s3, t[4], MPFR_RNDN);
+                      MPFR_ASSERTN (inex1 == 0);
+                      prec = mpfr_get_exp (s4);
+                      mpfr_set_prec (sum1, prec);
+                      mpfr_set_prec (sum2, prec);
+                      RND_LOOP (r)
+                        {
+                          inex1 = mpfr_set (sum1, s4, (mpfr_rnd_t) r);
+                          inex2 = mpfr_sum (sum2, p, 5, (mpfr_rnd_t) r);
+                          MPFR_ASSERTN (mpfr_check (sum1));
+                          MPFR_ASSERTN (mpfr_check (sum2));
+                          if (!(mpfr_equal_p (sum1, sum2) &&
+                                SAME_SIGN (inex1, inex2)))
+                            {
+                              printf ("Error in check2 on %s, prec = %d, "
+                                      "i = %d, j = %d, k = %d, f1 = %d, "
+                                      "f2 = %d\n",
+                                      mpfr_print_rnd_mode ((mpfr_rnd_t) r),
+                                      prec, i, j, k, f1, f2);
+                              printf ("Expected ");
+                              mpfr_dump (sum1);
+                              printf ("with inex = %d\n", inex1);
+                              printf ("Got      ");
+                              mpfr_dump (sum2);
+                              printf ("with inex = %d\n", inex2);
+                              exit (1);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+  for (i = 0; i < 5; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, s1, s2, s3, s4, (mpfr_ptr) 0);
+}
+
+/* t[i] = (2^17 - 1) * 2^(17*(i-8)) for 0 <= i <= 16.
+ * t[17] = 2^(17*9+1) * j for -4 <= j <= 4.
+ * t[18] = 2^(-1) * k for -1 <= k <= 1.
+ * t[19] = 2^(-17*8) * m for -3 <= m <= 3.
+ * prec = 17*9+4
+ */
+static void
+check3 (void)
+{
+  mpfr_t sum1, sum2, s1, s2, s3, s4, t[20];
+  mpfr_ptr p[20];
+  int i, s, j, k, m, r, inex1, inex2;
+  int prec = 17*9+4;
+
+  mpfr_init2 (sum1, prec);
+  mpfr_init2 (sum2, prec);
+  mpfr_init2 (s1, 17*17);
+  mpfr_init2 (s2, 17*17+4);
+  mpfr_init2 (s3, 17*17+4);
+  mpfr_init2 (s4, 17*17+5);
+  mpfr_set_ui (s1, 0, MPFR_RNDN);
+  for (i = 0; i < 20; i++)
+    {
+      mpfr_init2 (t[i], 20);
+      p[i] = t[i];
+      if (i < 17)
+        {
+          mpfr_set_ui_2exp (t[i], 0x1ffff, 17*(i-8), MPFR_RNDN);
+          inex1 = mpfr_add (s1, s1, t[i], MPFR_RNDN);
+          MPFR_ASSERTN (inex1 == 0);
+        }
+    }
+
+  for (s = 1; s >= -1; s -= 2)
+    {
+      for (j = -4; j <= 4; j++)
+        {
+          mpfr_set_si_2exp (t[17], j, 17*9+1, MPFR_RNDN);
+          inex1 = mpfr_add (s2, s1, t[17], MPFR_RNDN);
+          MPFR_ASSERTN (inex1 == 0);
+          for (k = -1; k <= 1; k++)
+            {
+              mpfr_set_si_2exp (t[18], k, -1, MPFR_RNDN);
+              inex1 = mpfr_add (s3, s2, t[18], MPFR_RNDN);
+              MPFR_ASSERTN (inex1 == 0);
+              for (m = -3; m <= 3; m++)
+                {
+                  mpfr_set_si_2exp (t[19], m, -17*8, MPFR_RNDN);
+                  inex1 = mpfr_add (s4, s3, t[19], MPFR_RNDN);
+                  MPFR_ASSERTN (inex1 == 0);
+                  RND_LOOP (r)
+                    {
+                      inex1 = mpfr_set (sum1, s4, (mpfr_rnd_t) r);
+                      inex2 = mpfr_sum (sum2, p, 20, (mpfr_rnd_t) r);
+                      MPFR_ASSERTN (mpfr_check (sum1));
+                      MPFR_ASSERTN (mpfr_check (sum2));
+                      if (!(mpfr_equal_p (sum1, sum2) &&
+                            SAME_SIGN (inex1, inex2)))
+                        {
+                          printf ("Error in check3 on %s, "
+                                  "s = %d, j = %d, k = %d, m = %d\n",
+                                  mpfr_print_rnd_mode ((mpfr_rnd_t) r),
+                                  s, j, k, m);
+                          printf ("Expected ");
+                          mpfr_dump (sum1);
+                          printf ("with inex = %d\n", inex1);
+                          printf ("Got      ");
+                          mpfr_dump (sum2);
+                          printf ("with inex = %d\n", inex2);
+                          exit (1);
+                        }
+                    }
+                }
+            }
+        }
+      for (i = 0; i < 17; i++)
+        mpfr_neg (t[i], t[i], MPFR_RNDN);
+      mpfr_neg (s1, s1, MPFR_RNDN);
+    }
+
+  for (i = 0; i < 20; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, s1, s2, s3, s4, (mpfr_ptr) 0);
+}
+
+/* Test of s * (q * 2^(n-1) - 2^k) + h + i * 2^(-2) + j * 2^(-2)
+ * with h = -1 or 1, -1 <= i odd <= j <= 3, 2 <= q <= 3, s = -1 or 1,
+ * prec n-k.
+ * On a 64-bit machine:
+ * MPFR_RNDN, tmd=2, rbit=0, sst=0, negative is checked with the inputs
+ *   -3*2^58, 2^5, -1, 2^(-2), 3*2^(-2)
+ * MPFR_RNDN, tmd=2, rbit=0, sst=1, negative is checked with the inputs
+ *   -3*2^58, 2^5, -1, 3*2^(-2), 3*2^(-2)
+ */
+static void
+check4 (void)
+{
+  mpfr_t sum1, sum2, s1, s2, s3, s4, t[5];
+  mpfr_ptr p[5];
+  int h, i, j, k, n, q, r, s, prec, inex1, inex2;
+
+  mpfr_inits2 (257, sum1, sum2, s1, s2, s3, s4, (mpfr_ptr) 0);
+  for (i = 0; i < 5; i++)
+    {
+      mpfr_init2 (t[i], 2);
+      p[i] = t[i];
+    }
+
+  /* No GNU style for the many nested loops... */
+  for (k = 1; k <= 64; k++) {
+    mpfr_set_si_2exp (t[0], -1, k, MPFR_RNDN);
+    for (n = k + 2; n <= k + 65; n++) {
+      prec = n - k;
+      mpfr_set_prec (sum1, prec);
+      mpfr_set_prec (sum2, prec);
+      for (q = 2; q <= 3; q++) {
+        mpfr_set_si_2exp (t[1], q, n - 1, MPFR_RNDN);
+        inex1 = mpfr_add (s1, t[0], t[1], MPFR_RNDN);
+        MPFR_ASSERTN (inex1 == 0);
+        for (s = -1; s <= 1; s += 2) {
+          mpfr_neg (t[0], t[0], MPFR_RNDN);
+          mpfr_neg (t[1], t[1], MPFR_RNDN);
+          mpfr_neg (s1, s1, MPFR_RNDN);
+          for (h = -1; h <= 1; h += 2) {
+            mpfr_set_si (t[2], h, MPFR_RNDN);
+            inex1 = mpfr_add (s2, s1, t[2], MPFR_RNDN);
+            MPFR_ASSERTN (inex1 == 0);
+            for (i = -1; i <= 3; i += 2) {
+              mpfr_set_si_2exp (t[3], i, -2, MPFR_RNDN);
+              inex1 = mpfr_add (s3, s2, t[3], MPFR_RNDN);
+              MPFR_ASSERTN (inex1 == 0);
+              for (j = i; j <= 3; j++) {
+                mpfr_set_si_2exp (t[4], j, -2, MPFR_RNDN);
+                inex1 = mpfr_add (s4, s3, t[4], MPFR_RNDN);
+                MPFR_ASSERTN (inex1 == 0);
+                RND_LOOP (r) {
+                  inex1 = mpfr_set (sum1, s4, (mpfr_rnd_t) r);
+                  inex2 = mpfr_sum (sum2, p, 5, (mpfr_rnd_t) r);
+                  MPFR_ASSERTN (mpfr_check (sum1));
+                  MPFR_ASSERTN (mpfr_check (sum2));
+                  if (!(mpfr_equal_p (sum1, sum2) &&
+                        SAME_SIGN (inex1, inex2)))
+                    {
+                      printf ("Error in check4 on %s, "
+                              "k = %d, n = %d (prec %d), "
+                              "q = %d, s = %d, h = %d, i = %d, j = %d\n",
+                              mpfr_print_rnd_mode ((mpfr_rnd_t) r),
+                              k, n, prec, q, s, h, i, j);
+                      printf ("Expected ");
+                      mpfr_dump (sum1);
+                      printf ("with inex = %d\n", inex1);
+                      printf ("Got      ");
+                      mpfr_dump (sum2);
+                      printf ("with inex = %d\n", inex2);
+                      exit (1);
+                    }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  for (i = 0; i < 5; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, s1, s2, s3, s4, (mpfr_ptr) 0);
+}
+
 /* bug reported by Joseph S. Myers on 2013-10-27
    https://sympa.inria.fr/sympa/arc/mpfr/2013-10/msg00015.html */
 static void
 bug20131027 (void)
 {
-  mpfr_t r, t[4];
+  mpfr_t sum, t[4];
   mpfr_ptr p[4];
   char *s[4] = {
     "0x1p1000",
@@ -331,35 +641,99 @@ bug20131027 (void)
     "-0x1p947",
     "0x1p880"
   };
-  int i;
+  int i, r;
+
+  mpfr_init2 (sum, 53);
 
-  mpfr_init2 (r, 53);
   for (i = 0; i < 4; i++)
     {
       mpfr_init2 (t[i], i == 0 ? 53 : 1000);
       mpfr_set_str (t[i], s[i], 0, MPFR_RNDN);
       p[i] = t[i];
     }
-  mpfr_sum (r, p, 4, MPFR_RNDN);
 
-  if (MPFR_NOTZERO (r))
+  RND_LOOP(r)
     {
-      printf ("mpfr_sum incorrect in bug20131027: expected 0, got\n");
-      mpfr_dump (r);
-      exit (1);
+      int expected_sign = (mpfr_rnd_t) r == MPFR_RNDD ? -1 : 1;
+      int inex;
+
+      inex = mpfr_sum (sum, p, 4, (mpfr_rnd_t) r);
+
+      if (MPFR_NOTZERO (sum) || MPFR_SIGN (sum) != expected_sign || inex != 0)
+        {
+          printf ("mpfr_sum incorrect in bug20131027 for %s:\n"
+                  "expected %c0 with inex = 0, got ",
+                  mpfr_print_rnd_mode ((mpfr_rnd_t) r),
+                  expected_sign > 0 ? '+' : '-');
+          mpfr_dump (sum);
+          printf ("with inex = %d\n", inex);
+          exit (1);
+        }
     }
 
   for (i = 0; i < 4; i++)
     mpfr_clear (t[i]);
-  mpfr_clear (r);
+  mpfr_clear (sum);
 }
 
+/* Occurs in branches/new-sum/src/sum.c@9344 on a 64-bit machine. */
+static void
+bug20150327 (void)
+{
+  mpfr_t sum1, sum2, t[3];
+  mpfr_ptr p[3];
+  char *s[3] = { "0.10000111110101000010101011100001", "1E-100", "0.1E95" };
+  int i, r;
+
+  mpfr_inits2 (58, sum1, sum2, (mpfr_ptr) 0);
+
+  for (i = 0; i < 3; i++)
+    {
+      mpfr_init2 (t[i], 64);
+      mpfr_set_str (t[i], s[i], 2, MPFR_RNDN);
+      p[i] = t[i];
+    }
+
+  RND_LOOP(r)
+    {
+      int inex1, inex2;
+
+      mpfr_set (sum1, t[2], MPFR_RNDN);
+      inex1 = -1;
+      if (MPFR_IS_LIKE_RNDU ((mpfr_rnd_t) r, 1))
+        {
+          mpfr_nextabove (sum1);
+          inex1 = 1;
+        }
+
+      inex2 = mpfr_sum (sum2, p, 3, (mpfr_rnd_t) r);
+
+      if (!(mpfr_equal_p (sum1, sum2) && SAME_SIGN (inex1, inex2)))
+        {
+          printf ("mpfr_sum incorrect in bug20150327 for %s:\n",
+                  mpfr_print_rnd_mode ((mpfr_rnd_t) r));
+          printf ("Expected ");
+          mpfr_dump (sum1);
+          printf ("with inex = %d\n", inex1);
+          printf ("Got      ");
+          mpfr_dump (sum2);
+          printf ("with inex = %d\n", inex2);
+          exit (1);
+        }
+    }
+
+  for (i = 0; i < 3; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, (mpfr_ptr) 0);
+}
+
+/* TODO: A test with more inputs (but can't be compared to mpfr_add). */
 static void
 check_extreme (void)
 {
   mpfr_t u, v, w, x, y;
   mpfr_ptr t[2];
-  int i, inex, r;
+  int i, inex1, inex2, r;
 
   t[0] = u;
   t[1] = v;
@@ -371,19 +745,21 @@ check_extreme (void)
   RND_LOOP (r)
     for (i = 0; i < 2; i++)
       {
-        mpfr_sum (x, t, 2, (mpfr_rnd_t) r);
-        mpfr_set_prec (y, 64);
-        inex = mpfr_add (y, u, w, MPFR_RNDN);
-        MPFR_ASSERTN (inex == 0);
-        mpfr_prec_round (y, 32, (mpfr_rnd_t) r);
-        if (! mpfr_equal_p (x, y))
+        mpfr_set_prec (x, 64);
+        inex1 = mpfr_add (x, u, w, MPFR_RNDN);
+        MPFR_ASSERTN (inex1 == 0);
+        inex1 = mpfr_prec_round (x, 32, (mpfr_rnd_t) r);
+        inex2 = mpfr_sum (y, t, 2, (mpfr_rnd_t) r);
+        if (!(mpfr_equal_p (x, y) && SAME_SIGN (inex1, inex2)))
           {
             printf ("Error in check_extreme (%s, i = %d)\n",
                     mpfr_print_rnd_mode ((mpfr_rnd_t) r), i);
             printf ("Expected ");
-            mpfr_dump (y);
-            printf ("Got      ");
             mpfr_dump (x);
+            printf ("with inex = %d\n", inex1);
+            printf ("Got      ");
+            mpfr_dump (y);
+            printf ("with inex = %d\n", inex2);
             exit (1);
           }
         mpfr_neg (v, v, MPFR_RNDN);
@@ -392,19 +768,19 @@ check_extreme (void)
   mpfr_clears (u, v, w, x, y, (mpfr_ptr) 0);
 }
 
+/* Generic random tests with cancellations */
 static void
 cancel (void)
 {
-  mpfr_t x[2 * MPFR_NCANCEL];
+  mpfr_t x[2 * MPFR_NCANCEL], bound;
   mpfr_ptr px[2 * MPFR_NCANCEL];
   int i, j, n;
 
-  /* FIXME: re-enable and improve once mpfr_sum has been fixed.
-     check_extreme() is currently sufficient to show problems. */
-  return;
+  mpfr_init2 (bound, 2);
 
   for (i = 0; i < 8; i++)
     {
+      mpfr_set_inf (bound, 1);
       for (n = 0; n < numberof (x); n++)
         {
           mpfr_prec_t p;
@@ -445,35 +821,254 @@ cancel (void)
                       mpfr_print_rnd_mode (rnd), mpfr_get_prec (x[n]));
 #endif
               mpfr_sum (x[n], px, n, rnd);
+              if (mpfr_zero_p (x[n]))
+                {
+                  n++;
+                  break;
+                }
               mpfr_neg (x[n], x[n], MPFR_RNDN);
+              if (mpfr_greater_p (x[n], bound))
+                {
+                  printf ("Error in cancel on i = %d, n = %d\n", i, n);
+                  exit (1);
+                }
+              mpfr_set_ui_2exp (bound, 1,
+                                mpfr_get_exp (x[n]) - p - (rnd == MPFR_RNDN),
+                                MPFR_RNDN);
+              /* The next sum will be <= bound in absolute value
+                 (the equality can be obtained in all rounding modes
+                 since the sum will be rounded). */
             }
         }
 
       while (--n >= 0)
         mpfr_clear (x[n]);
     }
+
+  mpfr_clear (bound);
+}
+
+#ifndef NOVFL
+# define NOVFL 30
+#endif
+
+static void
+check_overflow (void)
+{
+  mpfr_t sum1, sum2, x, y;
+  mpfr_ptr t[2 * NOVFL];
+  mpfr_exp_t emin, emax;
+  int i, r;
+
+  emin = mpfr_get_emin ();
+  emax = mpfr_get_emax ();
+  set_emin (MPFR_EMIN_MIN);
+  set_emax (MPFR_EMAX_MAX);
+
+  mpfr_inits2 (32, sum1, sum2, x, y, (mpfr_ptr) 0);
+  mpfr_setmax (x, mpfr_get_emax ());
+  mpfr_neg (y, x, MPFR_RNDN);
+
+  for (i = 0; i < 2 * NOVFL; i++)
+    t[i] = i < NOVFL ? x : y;
+
+  /* Two kinds of test:
+   *   i = 1: overflow.
+   *   i = 2: intermediate overflow (exact sum is 0).
+   */
+  for (i = 1; i <= 2; i++)
+    RND_LOOP(r)
+      {
+        int inex1, inex2;
+
+        inex1 = mpfr_add (sum1, x, i == 1 ? x : y, (mpfr_rnd_t) r);
+        inex2 = mpfr_sum (sum2, t, i * NOVFL, (mpfr_rnd_t) r);
+        MPFR_ASSERTN (mpfr_check (sum1));
+        MPFR_ASSERTN (mpfr_check (sum2));
+        if (!(mpfr_equal_p (sum1, sum2) && SAME_SIGN (inex1, inex2)))
+          {
+            printf ("Error in check_overflow on %s, i = %d\n",
+                    mpfr_print_rnd_mode ((mpfr_rnd_t) r), i);
+            printf ("Expected ");
+            mpfr_dump (sum1);
+            printf ("with inex = %d\n", inex1);
+            printf ("Got      ");
+            mpfr_dump (sum2);
+            printf ("with inex = %d\n", inex2);
+            exit (1);
+          }
+      }
+
+  mpfr_clears (sum1, sum2, x, y, (mpfr_ptr) 0);
+
+  set_emin (emin);
+  set_emax (emax);
+}
+
+#ifndef NUNFL
+# define NUNFL 9
+#endif
+
+/* t[0] = 2^(-k) - sum(t[i],i=1..n)
+ */
+static void
+check_underflow (void)
+{
+  mpfr_t sum1, sum2, t[NUNFL];
+  mpfr_ptr p[NUNFL];
+  mpfr_prec_t precmax = 444;
+  mpfr_exp_t emin, emax;
+  unsigned int ex_flags, flags;
+  int c, i;
+
+  emin = mpfr_get_emin ();
+  emax = mpfr_get_emax ();
+  set_emin (MPFR_EMIN_MIN);
+  set_emax (MPFR_EMAX_MAX);
+
+  ex_flags = MPFR_FLAGS_UNDERFLOW | MPFR_FLAGS_INEXACT;
+
+  mpfr_init2 (sum1, MPFR_PREC_MIN);
+  mpfr_init2 (sum2, precmax);
+
+  for (i = 0; i < NUNFL; i++)
+    {
+      mpfr_init2 (t[i], precmax);
+      p[i] = t[i];
+    }
+
+  for (c = 0; c < 8; c++)
+    {
+      mpfr_prec_t fprec;
+      int n, neg, r;
+
+      fprec = MPFR_PREC_MIN + (randlimb () % (precmax - MPFR_PREC_MIN + 1));
+      n = 3 + (randlimb () % (NUNFL - 2));
+      MPFR_ASSERTN (n <= NUNFL);
+
+      mpfr_set_prec (sum2, (randlimb () & 1) ? MPFR_PREC_MIN : precmax);
+      mpfr_set_prec (t[0], fprec + 64);
+      mpfr_set_zero (t[0], 1);
+
+      for (i = 1; i < n; i++)
+        {
+          int inex;
+
+          mpfr_set_prec (t[i], MPFR_PREC_MIN +
+                         (randlimb () % (fprec - MPFR_PREC_MIN + 1)));
+          do
+            mpfr_urandomb (t[i], RANDS);
+          while (MPFR_IS_ZERO (t[i]));
+          mpfr_set_exp (t[i], MPFR_EMIN_MIN);
+          inex = mpfr_sub (t[0], t[0], t[i], MPFR_RNDN);
+          MPFR_ASSERTN (inex == 0);
+        }
+
+      neg = randlimb () & 1;
+      if (neg)
+        mpfr_nextbelow (t[0]);
+      else
+        mpfr_nextabove (t[0]);
+
+      RND_LOOP(r)
+        {
+          int inex1, inex2;
+
+          mpfr_set_zero (sum1, 1);
+          if (neg)
+            mpfr_nextbelow (sum1);
+          else
+            mpfr_nextabove (sum1);
+          inex1 = mpfr_div_2ui (sum1, sum1, 2, (mpfr_rnd_t) r);
+
+          mpfr_clear_flags ();
+          inex2 = mpfr_sum (sum2, p, n, (mpfr_rnd_t) r);
+          flags = __gmpfr_flags;
+
+          MPFR_ASSERTN (mpfr_check (sum1));
+          MPFR_ASSERTN (mpfr_check (sum2));
+
+          if (flags != ex_flags)
+            {
+              printf ("Bad flags in check_underflow on %s, c = %d\n",
+                      mpfr_print_rnd_mode ((mpfr_rnd_t) r), c);
+              printf ("Expected flags:");
+              flags_out (ex_flags);
+              printf ("Got flags:     ");
+              flags_out (flags);
+              printf ("sum = ");
+              mpfr_dump (sum2);
+              exit (1);
+            }
+
+          if (!(mpfr_equal_p (sum1, sum2) && SAME_SIGN (inex1, inex2)))
+            {
+              printf ("Error in check_underflow on %s, c = %d\n",
+                      mpfr_print_rnd_mode ((mpfr_rnd_t) r), c);
+              printf ("Expected ");
+              mpfr_dump (sum1);
+              printf ("with inex = %d\n", inex1);
+              printf ("Got      ");
+              mpfr_dump (sum2);
+              printf ("with inex = %d\n", inex2);
+              exit (1);
+            }
+        }
+    }
+
+  for (i = 0; i < NUNFL; i++)
+    mpfr_clear (t[i]);
+  mpfr_clears (sum1, sum2, (mpfr_ptr) 0);
+
+  set_emin (emin);
+  set_emax (emax);
+}
+
+static void
+check_coverage (void)
+{
+#ifdef MPFR_COV_CHECK
+  int r, i, j, k, p;
+  int err = 0;
+
+  for (r = 0; r < MPFR_RND_MAX; r++)
+    for (i = 0; i < 1 + ((mpfr_rnd_t) r == MPFR_RNDN); i++)
+      for (j = 0; j < 2; j++)
+        for (k = 0; k < 3; k++)
+          for (p = 0; p < 2; p++)
+            if (!__gmpfr_cov_sum_tmd[r][i][j][k][p])
+              {
+                printf ("TMD not tested on %s, tmd=%d, rbit=%d, sst=%d, %s\n",
+                        mpfr_print_rnd_mode ((mpfr_rnd_t) r), i+1, j, k-1,
+                        p ? "positive" : "negative");
+                err = 1;
+              }
+
+  if (err)
+    exit (1);
+#endif
 }
 
 int
 main (void)
 {
-  mpfr_prec_t p;
-  unsigned long n;
-
   tests_start_mpfr ();
 
   check_special ();
+  check_more_special ();
+  check1 ();
+  check2 ();
+  check3 ();
+  check4 ();
   bug20131027 ();
-  test_sort (1764, 1026);
-  for (p = 2 ; p < 444 ; p += 17)
-    for (n = 2 ; n < 1026 ; n += 42 + p)
-      test_sum (p, n);
-  /* FIXME: remove the MPFR_SKIP_TSUM test once the new mpfr_sum version
-     is out. */
-  if (getenv ("MPFR_SKIP_EXTREME") == NULL)
-    check_extreme ();
+  bug20150327 ();
+  generic_tests ();
+  check_extreme ();
   cancel ();
+  check_overflow ();
+  check_underflow ();
 
+  check_coverage ();
   tests_end_mpfr ();
   return 0;
 }