\documentclass {article} \usepackage[a4paper]{geometry} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{amsmath,amssymb} \usepackage{url} \usepackage[notref,notcite]{showkeys} \newcommand {\corr}[1]{\widetilde {#1}} \newcommand {\appro}[1]{\overline {#1}} \newcommand {\mpc}{{\tt mpc}} \newcommand {\mpfr}{{\tt mpfr}} \newcommand {\ulp}[1]{#1~ulp} \newcommand {\Ulp}{{\operatorname {ulp}}} \DeclareMathOperator{\Exp}{\operatorname {Exp}} \newcommand {\atantwo}{\operatorname {atan2}} \newcommand{\error}{\operatorname {error}} \newcommand{\relerror}{\operatorname {relerror}} \newcommand{\Norm}{\operatorname {N}} \newcommand {\round}{\operatorname {\circ}} \DeclareMathOperator{\pinf}{\bigtriangleup} \DeclareMathOperator{\minf}{\bigtriangledown} \DeclareMathOperator{\N}{\mathcal N} \DeclareMathOperator{\A}{\mathcal A} \newcommand {\Z}{\mathbb Z} \newcommand {\Q}{\mathbb Q} \newcommand {\R}{\mathbb R} \renewcommand {\epsilon}{\varepsilon} \renewcommand {\theta}{\vartheta} \renewcommand {\leq}{\leqslant} \renewcommand {\geq}{\geqslant} \newtheorem{theorem}{Theorem} \newtheorem{lemma}[theorem]{Lemma} \newtheorem{definition}[theorem]{Definition} \newtheorem{prop}[theorem]{Proposition} \newtheorem{conj}[theorem]{Conjecture} \newenvironment{proof}{\noindent{\bf Proof:}}{{\hspace* {\fill}$\blacksquare$}} \newcommand {\enumi}[1]{(\alph {#1})} \renewcommand {\labelenumi}{\enumi {enumi}} \newcommand {\enumii}[1]{(\roman {#1})} \renewcommand {\labelenumii}{\enumii {enumii}} \title {MPC: Algorithms and Error Analysis} \author {Andreas Enge \and Philippe Th\'eveny \and Paul Zimmermann} \date {June 17, 2009} \begin {document} \maketitle \tableofcontents \section {Error propagation} \subsection {Introduction and notation} This section is devoted to the analysis of error propagation: Given a function whose input arguments already have a certain error, what is the error bound on the function output? 
The output error usually consists of two components: the error propagated from the input, which may be arbitrarily amplified (or, if one is lucky, shrunk); and an additional small error accounting for the rounding of the output. The results are needed to give a cumulated error analysis for algorithms that combine several elementary arithmetic operations. \subsubsection {Ulp calculus} \begin {definition} \label {def:exp} Let $x$ be a real number, which can be written uniquely as $x = m \cdot 2^e$ with $\frac{1}{2} \le |m| < 1$. The {\em exponent} of $x$ is $\Exp(x) = e = \lfloor \log_2 |x| \rfloor + 1$. The number is {\em representable at precision~$p$} if $2^p m$ is an integer. We denote the rounding of $x$ to one of the at most two representable numbers in the open interval $(x - 2^{-p}, x + 2^{-p})$ by $\round (x) = \round_p (x)$, with rounding being to nearest, up, down, towards zero or away from zero if there is a choice. \end {definition} \begin {prop} \label {prop:expmuldiv} If $x_1$ and $x_2$ are two real numbers, then \begin {gather*} \Exp (x_1) + \Exp (x_2) - 1 \leq \Exp (x_1 x_2) \leq \Exp (x_1) + \Exp (x_2), \\ \Exp (x_1) - \Exp (x_2) \leq \Exp \left( \frac {x_1}{x_2} \right) \leq \Exp (x_1) - \Exp (x_2) + 1. \end {gather*} \end {prop} \begin {proof} Write $x_n = m_n 2^{\Exp (x_n)}$ and $x = x_1 x_2 = m 2^{\Exp x} = m_1 m_2 2^{\Exp (x_1) + \Exp (x_2)}$ with $\frac {1}{2} \leq m_n, m < 1$. Then $m = m_1 m_2$ if the product is at least $\frac {1}{2}$ and $m = 2 m_1 m_2$ if the product is less than $\frac {1}{2}$, which yields the first line of inequalities. The other inequalities are derived in the same way from $\frac {1}{2} < \frac {m_1}{m_2} < 2$. \end {proof} \begin {prop} \label {prop:expround} For any real number $x$, \[ \Exp (x) \leq \Exp (\round (x)) \leq \Exp (x) + 1, \] with equality occurring on the right if and only if $|x|$ has been rounded up to $|\round (x)| = 2^{\Exp (x)}$. 
\end {prop} \begin {proof} Letting $x = m 2^{\Exp (x)}$, we have $\frac {1}{2} \cdot 2^{\Exp (x)} \leq |\round (x)| \leq 1 \cdot 2^{\Exp (x)}$, since these two numbers are representable (independently of the precision). \end {proof} \begin {definition} \label {def:ulp} Let $x$ be a real number which is representable at precision~$p$. Its associated {\em unit in the last place} is $\Ulp(x) = \Ulp_p (x) = 2^{\Exp(x) - p}$, so that adding $\Ulp(x)$ to $x$ corresponds to adding $1$ to the integer $2^p m$. \end {definition} \subsubsection {Absolute error} In the remainder of this chapter, all complex numbers are denoted by the letter $z$ with subscripts and mathematical accents, decomposed in Cartesian coordinates as $z = x + i y$ with the same diacritics applied to $x$ and $y$ as to $z$. All representable real numbers are supposed to have the same precision~$p$. We apply the following error definition of real numbers separately to the two coordinates of a complex number. \begin {definition} \label {def:error} Given a correct real number $\corr x$ and its approximation $\appro x$, we define the {\em absolute error} of $\appro x$ as $\error (\appro x) = | \corr x - \appro x |$. \end {definition} Notice that in the following, the absolute error is usually expressed in terms of $\Ulp$, which is itself a relative measure with respect to the exponent of the number. Let $\corr z = f (\corr {z_1}, \ldots) = \corr x + i \corr y$ be the correct result of a complex function applied to the correct arguments $\corr {z_n}$. We assume that the $\corr {z_n}$ themselves are not known, but only approximate input values $\appro {z_n} = \appro {x_n} + i \appro {y_n}$; for instance, the $\corr {z_n}$ may be the exact results of some formul\ae, whereas the $\appro {z_n}$ are the outcome of the corresponding computation and affected by rounding errors.
We suppose that error bounds $\error (\appro {x_n}) \leq k_{R, n} 2^{\Exp (\appro {x_n}) - p}$ and $\error (\appro {y_n}) \leq k_{I, n} 2^{\Exp (\appro {y_n}) - p}$ for some $k_{R, n}$ and $k_{I, n}$ are known. (This particular notation becomes more comprehensible when $\appro {x_n}$ and $\appro {y_n}$ are representable at precision~$p$, since then the units of the error measure become $\Ulp (\appro {x_n})$ and $\Ulp (\appro {y_n})$, respectively; however, there is no need to restrict the results of this chapter to representable numbers.) Our aim is to determine the propagated error in the output value $\appro z = \appro x + i \appro y = f (\appro {z_1}, \ldots)$, which is given by \begin {equation} \label {eq:properror} \error (\appro x) \leq | \Re (f (\corr {z_1}, \ldots)) - \Re (f (\appro {z_1}, \ldots)) | \end {equation} and an analogous formula for $\error (\appro y)$. In general, we are looking for $k_R$ and $k_I$ such that \[ \error (\appro x) \leq k_R 2^{\Exp (\appro x) - p} \text { and } \error (\appro y) \leq k_I 2^{\Exp (\appro y) - p}. \] Moreover, we are interested in the cumulated error if additionally $\appro z$ is rounded coordinatewise at the target precision~$p$ to $\round (\appro z)$. This operation adds an error of $c_R \Ulp (\round (\appro x))$ to the real and of $c_I \Ulp (\round (\appro y))$ to the imaginary part, where $c_X \leq 1$ when $\round$ stands for rounding up, down, to zero or to infinity, and $c_X \leq \frac {1}{2}$ when $\round$ stands for rounding to nearest. Then, via Proposition~\ref {prop:expround}, \[ \error (\round (\appro x)) \leq (k_R + c_R) \Ulp (\appro x) \text { and } \error (\round (\appro y)) \leq (k_I + c_I) \Ulp (\appro y). \] \subsubsection {Real relative error} It can sometimes be useful to determine errors not absolutely as differences (close to~$0$), but relatively as multiplicative factors (close to~$1$). 
\begin {definition} \label {def:relerror} Given a correct real number $\corr x$ and its approximation $\appro x$ of the same sign, we define the {\em lower and upper relative errors} of $\appro x$ as the smallest non-negative real numbers $\relerror^- (\appro x) = \epsilon^-$ and $\relerror^+ (\appro x) = \epsilon^+$ such that \[ 1 - \epsilon^- \leq \frac {|\corr x|}{|\appro x|} \leq 1 + \epsilon^+ \] or, equivalently, \[ - \epsilon^- \leq \frac {|\corr x| - |\appro x|}{| \appro x| } \leq \epsilon^+. \] The {\em relative error} of $\appro x$ is \[ \relerror (\appro x) = \epsilon = \max (\epsilon^-, \epsilon^+) = \frac {\error (\appro x)}{|\appro x|}. \] \end {definition} Notice that $\epsilon^- = 0$ whenever $|\appro x| \leq |\corr x|$ and $\epsilon^+ = 0$ whenever $|\appro x| \geq |\corr x|$, so that at least one of $\epsilon^+$ and $\epsilon^-$ is zero. The definition of relative error carries over immediately to complex numbers, see \S\ref {sssec:comrelerror}. However, in the following we usually argue separately for the two coordinates, so we use corresponding $\epsilon$-values with subscript $R$ and $I$ for the real and imaginary part, respectively. When an absolute error is expressed in the relative unit $\Ulp$, then it is easy to switch back and forth between absolute and relative errors. \begin {prop} \label {prop:relerror} Let $\appro x$ be representable at precision $p$. \begin {enumerate} \item If $\error (\appro x) \leq k \Ulp (\appro x)$, then $\relerror (\appro x) \leq k 2^{1 - p}$. \item If $\relerror (\appro x) \leq k 2^{-p}$, then $\error (\appro x) \leq k \Ulp (\appro x)$. \end {enumerate} These assertions remain valid if $\appro x$ is not representable at precision~$p$ and $\Ulp (\appro x)$ is replaced by $2^{\Exp (\appro x) - p}$. \end {prop} \begin {proof} Concerning the first assertion, we have $ \relerror (\appro x) = \frac {\error (\appro x)}{|\appro x|} \leq \frac {k \Ulp (\appro x)}{|\appro x|}.
$ Plugging in from Definition~\ref {def:ulp} that $\Ulp (\appro x) = 2^{\Exp (\appro x) - p}$ and $|\appro x| \geq 2^{\Exp (\appro x) - 1}$ finishes the proof. The second assertion is proved in the same manner, using $|\appro x| \leq 2^{\Exp (\appro x)}$. \end {proof} \subsubsection {Complex relative error} \label {sssec:comrelerror} Some care must be taken when generalising the real relative error to complex approximations; by keeping the absolute values in Definition~\ref {def:relerror}, all information is lost. \begin {definition} \label {def:comrelerror} Given a correct complex number $\corr z$ and its non-zero approximation $\appro z$, let \[ \theta = \frac {\corr z - \appro z}{\appro z}, \text { or } \corr z = (1 + \theta) \appro z. \] Then the {\em relative error} of $\appro z$ is \[ \relerror (\appro z) = \epsilon = | \theta | = \left| \frac {\corr z - \appro z}{\appro z} \right|. \] \end {definition} Notice that this definition coincides with Definition~\ref {def:relerror} for real numbers of the same sign. The following result gives a coarse estimate of the relative errors of the real and imaginary parts in terms of the complex relative error, and vice versa. \begin {prop} \label {prop:comrelerror} Let $\corr z = \corr x + i \corr y$, $\appro z = \appro x + i \appro y$, $\epsilon = \relerror (\appro z)$, $\epsilon_R = \relerror (\appro x)$ and $\epsilon_I = \relerror (\appro y)$, and assume that the closed circle around $\appro z$ of radius $\epsilon |\appro z|$ is contained in one quadrant of the complex plane. Then \begin {align*} \epsilon_R &\leq 2^{\max (0.5, \Exp (\appro y) - \Exp (\appro x) + 1.5)} \epsilon \\ \epsilon_I &\leq 2^{\max (0.5, \Exp (\appro x) - \Exp (\appro y) + 1.5)} \epsilon \\ \epsilon &\leq \sqrt 2 (\epsilon_R + \epsilon_I) \end {align*} \end {prop} \begin {proof} By assumption, the correct value $\corr z$ lies in the same quadrant as $\appro z$, so that $\corr x$ and $\appro x$ resp. $\corr y$ and $\appro y$ have the same sign. 
Write $\theta = \frac {\corr z - \appro z}{\appro z} = \theta_R + i \theta_I$. Then $\corr x - \appro x = \Re (\appro z \theta) = \appro x \theta_R - \appro y \theta_I$, and \begin {align*} \epsilon_R &= \left| \frac {\corr x - \appro x}{\appro x} \right| \leq |\theta_R| + \left| \frac {\appro y}{\appro x} \right| |\theta_I| \leq \max \left( 1, \left| \frac {\appro y}{\appro x} \right| \right) (|\theta_R| + |\theta_I|) \\ &\leq 2^{\max \left( 0, \Exp (\appro y) - \Exp (\appro x) + 1 \right)} \cdot \sqrt 2 |\theta| \end {align*} by Proposition~\ref {prop:expmuldiv}. The second inequality is proved in the same way. For the converse direction, write \[ |\theta_I| = \left| \Im \left( \frac {(\corr z - \appro z) \overline {\appro z}}{\appro x^2 + \appro y^2} \right) \right| \leq \left| \frac {- (\corr x - \appro x) \appro y + (\corr y - \appro y) \appro x} {\appro x \appro y} \right| \leq \left| \frac {\corr x - \appro x}{\appro x} \right| + \left| \frac {\corr y - \appro y}{\appro y} \right| \leq \epsilon_R + \epsilon_I, \] and similarly for $\theta_R$, which finishes the proof. \end {proof} \subsection {Real functions} In this section, we derive for later use results on error propagation for functions with real arguments and values. Those already contained in \cite{MPFRAlgorithms} are simply quoted for the sake of self-containedness. \subsubsection {Division} \label {sssec:proprealdiv} Let \[ \appro x = \frac {\appro {x_1}}{\appro {x_2}}.
\] Then \[ \error (\appro x) = \left| \frac {\corr {x_1}}{\corr {x_2}} - \frac {\appro {x_1}}{\appro {x_2}} \right| = \left| \frac {\appro {x_1}}{\appro {x_2}} \right| \cdot \left| 1 - \left| \frac {\corr {x_1}}{\appro {x_1}} \right| \cdot \left| \frac {\appro {x_2}}{\corr {x_2}} \right| \right| = | \appro x | \cdot \left| 1 - \left| \frac {\corr {x_1}}{\appro {x_1}} \right| \cdot \left| \frac {\appro {x_2}}{\corr {x_2}} \right| \right| \] Using the notation introduced in Definition~\ref {def:relerror} together with the obvious subscripts to the $\epsilon$, we obtain the bounds \[ - \frac {\epsilon_1^+ + \epsilon_2^-}{1 - \epsilon_2^-} = 1 - \frac {1 + \epsilon_1^+}{1 - \epsilon_2^-} \leq 1 - \left| \frac {\corr {x_1}}{\appro {x_1}} \right| \cdot \left| \frac {\appro {x_2}}{\corr {x_2}} \right| \leq 1 - \frac {1 - \epsilon_1^-}{1 + \epsilon_2^+} = \frac {\epsilon_1^- + \epsilon_2^+}{1 + \epsilon_2^+} \] We need to make the assumption that $\epsilon_2^- < 1$, which is reasonable since otherwise the absolute error on $\appro {x_2}$ would exceed the number itself. Then \[ \error (\appro x) \leq \max \left( \frac {\epsilon_1^+ + \epsilon_2^-}{1 - \epsilon_2^-}, \frac {\epsilon_1^- + \epsilon_2^+}{1 + \epsilon_2^+} \right) |\appro x| \leq \frac {\epsilon_1 + \epsilon_2}{1 - \epsilon_2^-} 2^{\Exp (\appro x)} \] Using the estimation of the relative in terms of the absolute error of Proposition~\ref {prop:relerror}, this bound can be translated into \begin {equation} \label {eq:proprealdiv} \error (\appro x) \leq \frac {2 (k_1 + k_2)}{1 - \epsilon_2^-} 2^{\Exp (\appro x) - p} \leq \frac {2 (k_1 + k_2)}{1 - k_2 2^{1 - p}} 2^{\Exp (\appro x) - p} \end {equation} \subsubsection {Square root} \label {sssec:proprealsqrt} Let \[ \appro x = \sqrt {\appro {x_1}}. \] Then by \cite[\S1.7]{MPFRAlgorithms}, \begin {equation} \label {eq:proprealsqrt} \error (\appro x) \leq \frac {2 k_1}{1 + \sqrt {1 + \epsilon_1^-}} 2^{\Exp (\appro x) - p}. 
\end {equation} \subsection {Complex functions} \subsubsection {Addition/subtraction} Using the notation introduced above, we consider \[ \appro z = \appro {z_1} + \appro {z_2}. \] By \eqref {eq:properror}, we obtain \begin{align*} \error (\appro x) & \leq | (\corr {x_1} + \corr {x_2}) - (\appro {x_1} + \appro {x_2})| \\ & \leq | \corr {x_1} - \appro {x_1} | + | \corr {x_2} - \appro {x_2}| \\ & \leq k_{R,1} 2^{\Exp (\appro {x_1}) - p} + k_{R,2} 2^{\Exp (\appro {x_2}) - p} \\ & \leq \left( k_{R,1} 2^{d_{R,1}} + k_{R,2} 2^{d_{R,2}} \right) 2^{\Exp (\appro x) - p}, \end{align*} where $d_{R,n}=\Exp(\appro {x_n})-\Exp(\appro x)$. Otherwise said, the absolute errors add up, but their relative expression in terms of $\Ulp$ of the result grows if the result has a smaller exponent than the operands, that is, if cancellation occurs. If $\appro {x_1}$ and $\appro {x_2}$, have the same sign, then there is no cancellation, $d_{R, n} \leq 0$ and \[ \error (\appro x) \leq (k_{R,1} + k_{R,2} + c_R) 2^{\Exp (\appro x) - p}. \] An analogous error bound holds for the imaginary part. For subtraction, the same bounds are obtained, except that the simpler bound now holds whenever $\appro {x_1}$ and $\appro {x_2}$ resp. $\appro {y_1}$ and $\appro {y_2}$ have different signs. \subsubsection {Multiplication} \label {sssec:propmul} Let \[ \appro z = \appro {z_1} \times \appro {z_2}, \] so that \begin {align*} \appro x & = \appro {x_1} \appro {x_2} - \appro {y_1} \appro {y_2}, \\ \appro y & = \appro {x_1} \appro {y_2} + \appro {x_2} \appro {y_1}. \end {align*} Then \[ \error (\appro x) \leq | \Re (\corr {z_1} \times \corr {z_2}) - \Re (\appro {z_1} \times \appro {z_2})| \leq | \corr {x_1} \corr {x_2} - \appro {x_1} \appro {x_2}| + | \corr {y_1} \corr {y_2} - \appro {y_1} \appro {y_2}|. 
\] The first term on the right hand side can be bounded as follows, where we use the short-hand notation $\epsilon_{R, 1}^+$ for $\relerror^+ (\appro {x_1})$, and analogously for other relative errors: \begin{align*} | \corr {x_1} \corr {x_2} - \appro {x_1} \appro {x_2}| & \leq \frac{1}{2} \left( |\appro {x_1} - \corr {x_1}| (|\appro {x_2}| + |\corr {x_2}|) + |\appro {x_2} - \corr {x_2}| (|\appro {x_1}| + |\corr {x_1}|) \right) \\ & \leq \frac {1}{2} \left( \epsilon_{R, 1} |\appro {x_1}| |\appro {x_2}| \left( 1 + \frac {|\corr {x_2}|}{|\appro {x_2}|} \right) + \epsilon_{R, 2} |\appro {x_2}| |\appro {x_1}| \left( 1 + \frac {|\corr {x_1}|}{|\appro {x_1}|} \right) \right) \\ & \leq \left( k_{R, 1} \left( 1 + \frac {|\corr {x_2}|}{|\appro {x_2}|} \right) + k_{R, 2} \left( 1 + \frac {|\corr {x_1}|}{|\appro {x_1}|} \right) \right) |\appro {x_1} \appro {x_2}| \, 2^{-p} \text { by Proposition~\ref {prop:relerror}} \\ & \leq \left( k_{R, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{R, 1}^+) \right) 2^{\Exp (\appro {x_1} \appro {x_2}) - p}. \end{align*} In the same way, we obtain \[ | \corr {y_1} \corr {y_2} - \appro {y_1} \appro {y_2}| \leq \left( k_{I, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{\Exp (\appro {y_1} \appro {y_2}) - p}. \] It remains to estimate $\Exp (\appro {x_1} \appro {x_2})$ and $\Exp (\appro {y_1} \appro {y_2})$ with respect to $\Exp (x)$ to obtain a bound in terms of $\Ulp (\appro x)$. This becomes problematic when, due to the subtraction, cancellation occurs. In all generality, let $d = \Exp (\appro {x_1} \appro {x_2}) - \Exp (\appro x) \leq \Exp (\appro {x_1}) + \Exp (\appro {x_2}) - \Exp (\appro x)$ by Proposition~\ref {prop:expmuldiv} and $d' = \Exp( \appro {y_1} \appro {y_2}) - \Exp (\appro x) \leq \Exp (\appro {y_1}) + \Exp (\appro {y_2}) - \Exp (\appro x)$. 
Then \begin {equation} \label {eq:propmulre} \error( \appro x) \leq \left( \left( k_{R, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{R, 1}^+) \right) 2^d + \left( k_{I, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{d'} \right) 2^{\Exp (\appro x) - p}. \end {equation} If $\appro {x_1} \appro {x_2}$ and $\appro {y_1} \appro {y_2}$ have different signs, then there is no cancellation, and, using the monotonicity of the exponent with respect to the absolute value, we obtain \[ \Exp (\appro x) = \Exp (\appro {x_1} \appro {x_2} - \appro {y_1} \appro {y_2}) = \Exp (|\appro {x_1} \appro {x_2}| + |\appro {y_1} \appro {y_2}|) \geq \Exp (|\appro {x_1} \appro {x_2}|), \Exp (|\appro {y_1} \appro {y_2}|), \] so that $d$, $d' \leq 0$ and the error bound simplifies as \[ \error( \appro x) \leq \left( k_{R, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{R, 1}^+) + k_{I, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{\Exp (\appro x) - p}. \] The same approach yields the error of the imaginary part. Letting $\delta = \Exp (\appro {x_1} \appro {y_2}) - \Exp (\appro y) \leq \Exp( \appro {x_1}) + \Exp (\appro {y_2}) - \Exp (\appro y)$ and $\delta' = \Exp (\appro {x_2} \appro {y_1}) - \Exp (\appro {y}) \leq \Exp (\appro {x_2}) + \Exp (\appro {y_1}) - \Exp (\appro y)$, it becomes \begin {equation} \label {eq:propmulim} \error( \appro y) \leq \left( \left( k_{R, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{R, 1}^+) \right) 2^{\delta} + \left( k_{I, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{\delta'} \right) 2^{\Exp (\appro y) - p}. \end {equation} If $\appro {x_1} \appro {y_2}$ and $\appro {x_2} \appro {y_1}$ have the same sign, then $\delta$, $\delta' \leq 0$ and \[ \error( \appro y) \leq \left( k_{R, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{R, 1}^+) + k_{I, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{\Exp (\appro y) - p}. 
\] Notice that $\appro {x_1} \appro {x_2}$ and $\appro {y_1} \appro {y_2}$ have the same sign if and only if $\appro {y_1} \appro {x_2}$ and $\appro {x_1} \appro {y_2}$ do. So cancellation may occur in precisely one of the real and the imaginary part. The different values $\epsilon_{X, n}^+$ for $X \in \{ R, I \}$ and $n \in \{ 1, 2 \}$ in the formul{\ae} above may be bounded by $k_{X, n} 2^{1 - p}$ according to Proposition~\ref {prop:relerror}. If some $|\appro {x_n}| \geq |\corr {x_n}|$ resp. $|\appro {y_n}| \geq |\corr {y_n}|$ (for instance, because they have been computed by rounding away from zero), then the corresponding $\epsilon_{X, n}^+$ are zero. A coarser bound may be obtained more easily by considering complex relative errors. Write $\corr {z_n} = (1 + \theta_n) \appro {z_n}$ with $\epsilon_n = | \theta_n |$. Then $\corr z = (1 + \theta) \appro z$ with $\theta = \theta_1 + \theta_2 + \theta_1 \theta_2$ and $\epsilon = |\theta| \leq \epsilon_1 + \epsilon_2 + \epsilon_1 \epsilon_2$. By Proposition~\ref {prop:relerror}, we have $\epsilon_{X, n} \leq k_{X, n} 2^{1-p}$ for $X \in \{ R, I \}$, and by Proposition~\ref {prop:comrelerror}, $\epsilon_n \leq (k_{R, n} + k_{I, n}) 2^{1.5 - p}$. Under normal circumstances, $\epsilon_1 \epsilon_2$ should be negligible, that is, $\epsilon_1 \epsilon_2 \leq (k_{R, 1} + k_{I, 1}) (k_{R, 2} + k_{I, 2}) 2^{3 - 2 p} \leq 2^{1.5 - p}$, so that $\epsilon \leq (k_{R, 1} + k_{I, 1} + k_{R, 2} + k_{I, 2} + 1) 2^{1.5 - p}$.
Applying Propositions~\ref {prop:comrelerror} and~\ref {prop:relerror} in the converse direction yields, under the assumption that $\corr z$ and $\appro z$ lie in the same quadrant of the complex plane, \begin {equation} \label {eq:propmulcomrel} \begin {array}{rl} \error (\appro x) &\leq (k_{R, 1} + k_{I, 1} + k_{R, 2} + k_{I, 2} + 1) 2^{\max (2, \Exp (\appro y) - \Exp (\appro x) + 3)} \cdot 2^{\Exp (\appro x) - p} \\ \error (\appro y) &\leq (k_{R, 1} + k_{I, 1} + k_{R, 2} + k_{I, 2} + 1) 2^{\max (2, \Exp (\appro x) - \Exp (\appro y) + 3)} \cdot 2^{\Exp (\appro y) - p} \end {array} \end {equation} \subsubsection {Norm} \label {sssec:propnorm} Let \[ \appro x = \Norm (\appro {z_1}) = |\appro {z_1}|^2 = \appro {x_1}^2 + \appro {y_1}^2. \] Then \[ \error (\appro x) \leq | \Norm (\corr {z_1}) - \Norm (\appro {z_1}) | \leq | \corr {x_1}^2 - \appro {x_1}^2 | + | \corr {y_1}^2 - \appro {y_1}^2 |. \] The first term can be bounded by \begin {align*} | \corr {x_1}^2 - \appro {x_1}^2 | & = |\appro {x_1}| \left| 1 + \frac {|\corr {x_1}|}{|\appro {x_1}|} \right| |\corr {x_1} - \appro {x_1}| \\ & \leq 2^{\Exp (\appro {x_1})} (2 + \epsilon_{R, 1}^+) k_{R, 1} 2^{\Exp (\appro {x_1}) - p} \\ & \leq k_{R, 1} (2 + \epsilon_{R, 1}^+) 2^{\Exp (\appro {x_1}^2) + 1 - p} \text { by Proposition~\ref {prop:expmuldiv}} \\ & \leq 2 k_{R, 1} (2 + \epsilon_{R, 1}^+) 2^{\Exp (\appro x) - p} \text { by the monotonicity of the exponent.} \end {align*} The analogous bound for the second error term yields \begin {equation} \label {eq:propnorm} \error (\appro x) \leq 2 \left( k_{R, 1} (2 + \epsilon_{R, 1}^+) + k_{I, 1} (2 + \epsilon_{I, 1}^+) \right) 2^{\Exp (\appro x) - p} \end {equation} The values $\epsilon_{X, 1}^+$ may be estimated as explained at the end of \S\ref {sssec:propmul}. We also need the relative lower error in the following. 
This can be obtained by writing \[ \appro {x_1}^2 - \corr {x_1}^2 = \left( 1 - \left| \frac {\corr {x_1}}{\appro {x_1}} \right|^2 \right) \cdot \appro {x_1}^2 \leq \big( 1 - (1 - \epsilon_{R, 1}^-)^2 \big) \appro {x_1}^2 = \big( \epsilon_{R, 1}^- (2 - \epsilon_{R, 1}^-) \big) \appro {x_1}^2. \] Adding the corresponding expression for the second term $\appro {y_1}^2 - \corr {y_1}^2$ yields \begin {equation} \label {eq:propnormepsminus} \frac {\appro x - \corr x}{\appro x} \leq \max \big( \epsilon_{R, 1}^- (2 - \epsilon_{R, 1}^-), \epsilon_{I, 1}^- (2 - \epsilon_{I, 1}^-) \big) =: \epsilon^-, \end {equation} and under the assumption that $\epsilon^- \geq 0$, inspection of Definition~\ref {def:relerror} shows that $\epsilon^- \geq \relerror^- (\appro x)$ since $\appro x$ and $\corr x$ are positive. The converse estimation yields \begin {equation} \label {eq:propnormepsplus} \relerror^+ (\appro x) \leq \epsilon^+ := \frac {\corr x - \appro x}{\appro x} \leq \max \big( \epsilon_{R, 1}^+ (2 + \epsilon_{R, 1}^+), \epsilon_{I, 1}^+ (2 + \epsilon_{I, 1}^+) \big) \end {equation} and $\relerror (\appro x) \leq \epsilon := \max (\epsilon^-, \epsilon^+)$. Letting $\epsilon_1 = \max ( \epsilon_{R, 1}^-, \epsilon_{R, 1}^+, \epsilon_{I, 1}^-, \epsilon_{I, 1}^+ ) = \max ( \epsilon_{R, 1}, \epsilon_{I, 1} )$ and $k_1 = \max ( k_{R, 1}, k_{I, 1})$, we have $\epsilon \leq \epsilon_1 (2 + \epsilon_1) \leq 2 k_1 (2 + \epsilon_1) 2^{-p}$ by Proposition~\ref {prop:relerror}. We obtain an alternative expression for the absolute error as \begin {equation} \label {eq:propnormalt} \error (\appro x) \leq \epsilon \appro x \leq 2 k_1 (2 + \epsilon_1) 2^{\Exp (\appro x) - p} \end {equation} \subsubsection {Division} \label{sssec:propdiv} Let \[ \appro z = \frac {\appro {z_1}}{\appro {z_2}} = \frac {\appro {z_1} \overline {\appro {z_2}}}{\Norm (\appro {z_2})}.
\] Then the propagated error may be derived by cumulating the errors obtained for multiplication in \S\ref {sssec:propmul}, the norm in \S\ref {sssec:propnorm} and the division by a real in \S\ref {sssec:proprealdiv}. Let $\appro {z_3} = \appro {z_1} \overline {\appro {z_2}} = \appro {x_3} + i \appro {y_3}$, $d = \Exp (\appro {x_1} \appro {x_2}) - \Exp (\appro {x_3})$ and $d' = \Exp (\appro {y_1} \appro {y_2}) - \Exp (\appro {x_3})$. Then \eqref {eq:propmulre} applies and yields $\error (\appro {x_3}) \leq k_{R, 3} 2^{\Exp (\appro {x_3}) - p}$ with \[ k_{R, 3} = \left( k_{R, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{R, 1}^+) \right) 2^d + \left( k_{I, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{d'}. \] We then apply \eqref {eq:propnorm} and \eqref {eq:propnormepsminus} to $\appro {x_4} = \Norm (\appro {z_2})$ to obtain $\error (\appro {x_4}) \leq k_4 2^{\Exp (\appro {x_4}) - p}$ with \[ k_4 = 2 \left( k_{R, 2} (2 + \epsilon_{R, 2}^+) + k_{I, 2} (2 + \epsilon_{I, 2}^+) \right) \] and \[ \relerror (\appro {x_4}) = \epsilon_4^- \leq \max \big( \epsilon_{R, 2}^- (2 - \epsilon_{R, 2}^-), \epsilon_{I, 2}^- (2 - \epsilon_{I, 2}^-) \big). \] Now \eqref {eq:proprealdiv} shows that \begin {equation} \label {eq:propdivre} \error (\appro x) \leq \frac {2 (k_{R, 3} + k_4)}{1 - \epsilon_4^-} 2^{\Exp (\appro x) - p}. \end {equation} As written above, $d$ and $d'$ depend not only on the input and output, but also on the intermediate value $\appro {x_3}$. 
But noting that $ \appro {x_3} = \appro x \Norm (\appro {z_2}) \geq \appro x 2^{2 \max (\Exp (\appro {x_2}), \Exp (\appro {y_2})) - 2}, $ so that \begin {align*} d &\leq \Exp (\appro {x_1} \appro {x_2}) - \Exp (\appro x) - 2 \max (\Exp (\appro {x_2}), \Exp (\appro {y_2})) + 2, \\ d' &\leq \Exp (\appro {y_1} \appro {y_2}) - \Exp (\appro x) - 2 \max (\Exp (\appro {x_2}), \Exp (\appro {y_2})) + 2, \end {align*} yields a bound \eqref {eq:propdivre} that is independent of intermediate values of the computation. The error in the imaginary part is computed in the same way as \begin {equation} \label {eq:propdivim} \error (\appro y) \leq \frac {2 (k_{I, 3} + k_4)}{1 - \epsilon_4^-} 2^{\Exp (\appro y) - p}. \end {equation} with \begin {align*} k_{I, 3} &= \left( k_{R, 1} (2 + \epsilon_{I, 2}^+) + k_{I, 2} (2 + \epsilon_{R, 1}^+) \right) 2^\delta + \left( k_{I, 1} (2 + \epsilon_{R, 2}^+) + k_{R, 2} (2 + \epsilon_{I, 1}^+) \right) 2^{\delta'} \\ \delta &\leq \Exp (\appro {x_1} \appro {y_2}) - \Exp (\appro y) - 2 \max (\Exp (\appro {x_2}), \Exp (\appro {y_2})) + 2 \\ \delta' &\leq \Exp (\appro {x_2} \appro {y_1}) - \Exp (\appro y) - 2 \max (\Exp (\appro {x_2}), \Exp (\appro {y_2})) + 2 \end {align*} As for the multiplication, a coarser bound may be obtained more easily using complex relative errors. Let $\corr {z_n} = (1 + \theta_n) \appro {z_n}$ with $\epsilon_n = | \theta_n |$. Then $\corr z = (1 + \theta) \appro z$ with \[ \theta = \frac {1 + \theta_1}{1 + \theta_2} - 1 = (\theta_1 - \theta_2) \sum_{k = 0}^\infty (- \theta_2)^k \text { and } \epsilon \leq (|\theta_1| + |\theta_2|) \sum_{k = 0}^\infty |\theta_2|^k.
\] Using the same notation and assumptions as at the end of \S\ref {sssec:propmul}, in particular that $\corr z$ and $\appro z$ lie in the same quadrant and that all higher order error terms (involving $\epsilon_1^2$, $\epsilon_2^2$, $\epsilon_1 \epsilon_2$ or higher powers) are absorbed by $2^{\Exp (\appro x) - p}$, we find the exact same error estimate \eqref {eq:propmulcomrel} also for the case of division. \subsubsection {Logarithm} \section {Algorithms} This section describes in detail the algorithms used in \mpc, together with the error analysis that allows to prove that the results are correct in the {\mpc} semantics: The input numbers are assumed to be exact, and the output corresponds to the exact result rounded in the desired direction. \subsection {\texttt {mpc\_sqrt}} The following algorithm is due to Friedland \cite{Friedland67,Smith98}. Let $z = x + i y$. Let $w = \sqrt { \frac {|x| + \sqrt {x^2 + y^2}}{2}}$ and $t = \frac {y}{2w}$. Then $(w + it)^2 = |x| + iy$, and with the branch cut on the negative real axis we obtain \[ \sqrt z = \left\{ \begin {array}{cl} w + i t & \text {if } x > 0 \\ t + i w & \text {if } x < 0, y > 0 \\ -t - i w & \text {if } x < 0, y < 0 \end {array} \right. \] We compute $w$ rounded down and thus round down all intermediate results. $\sqrt {x^2 + y^2}$ is computed with an error of \ulp{1} by a call to \texttt {mpc\_abs}; $|x|$ is added with an error of \ulp{1}, since both terms are positive; division by~$2$ is free of error. So $w^2$ is computed with a cumulated error of \ulp{2}. This error of \ulp{2} propagates as is through the real square root: Since we rounded down the argument, we have $\epsilon_1^- = 0$ in \eqref {eq:proprealsqrt}; an error of \ulp{1} needs to be added for the rounding of $w$, so that the total error is \ulp{3}. $t$ is rounded away. 
Plugging the error of \ulp{3} for $w$ and \ulp{0} for $y$ into \eqref {eq:proprealdiv} shows that the propagated error of real division is \ulp{6}, to which an additional rounding error of \ulp{1} has to be added for a total error of \ulp{7}. \subsection {\texttt {mpc\_log}} Let $z = x + i y$. Then $\log (z) = \frac {1}{2} \log (x^2 + y^2) + i \atantwo (y, x)$. The imaginary part is computed by a call to the corresponding {\mpfr} function. Let $w = \log (x^2 + y^2)$, rounded down. The error of the complex norm is \ulp{1}. The generic error of the real logarithm is then given by \ulp{$2^{2 - e_w} + 1$}, where $e_w$ is the exponent of $w$. For $e_w \geq 2$, this is bounded by \ulp{2} or 2~digits; otherwise, it is bounded by \ulp{$2^{3 - e_w}$} or $3 - e_w$ digits. \subsection {\texttt {mpc\_tan}} Let $z = x + i y$ with $x \neq 0$ and $y \neq 0$. We compute $\tan z$ as follows: \begin{align*} u &\leftarrow \A(\sin z) &\error(\Re(u)) &\leq 1 \Ulp(\Re(u)) &\error(\Im(u)) &\leq 1 \Ulp(\Im(u)) \\ v &\leftarrow \A(\cos z) &\error(\Re(v)) &\leq 1 \Ulp(\Re(v)) &\error(\Im(v)) &\leq 1 \Ulp(\Im(v)) \\ t &\leftarrow \A(u/v) &\error(\Re(t)) &\leq k_R \Ulp(\Re(t)) &\error(\Im(t)) &\leq k_I \Ulp(\Im(t)) \end{align*} where $w_2 \leftarrow \A(w_1)$ means that the real and imaginary parts of $w_2$ are respectively the real and imaginary part of $w_1$ rounded away from zero to the working precision. We know that $\Re(\frac{a+i b}{c+i d})=\frac{a c +b d}{c^2 + d^2}$ and $\Im(\frac{a+i b}{c+i d})=\frac{a d -b c}{c^2 + d^2}$, so in the special case of $\tan z=\frac{\sin x\cosh y+i\cos x\sinh y}{\cos x\cosh y-i\sin x\sinh y}$, we have $abcd < 0$ which means that there might be a cancellation in the computation of the real part while it never happens in that of the imaginary part.
Then, using the generic error of the division (see \S\ref{sssec:propdiv}), we have \begin{align*} \error(\Re(t)) &\leq [1+2^{3+e_1}+2^{3+e_2}+2^6] \Ulp(\Re(t)), \\ \error(\Im(t)) &\leq [1+2^3+2^3+2^6] \Ulp(\Im(t)), \end{align*} where $e_1=\Exp(a c) -\Exp(a c+b d)$ and $e_2=\Exp(b d) -\Exp(a c+b d)$. The second inequality shows that $2^7$ is a suitable choice for $k_I$. As $|\sinh y|<\cosh y$ for every nonzero $y$, we have $|b d| < |a c|$, so that $e_2 \leq e_1$.
% NOTE(review): text is missing from the source at this point (the remainder of the
% tangent analysis and the material introducing dyadic complex numbers and
% Proposition~\ref{prop:dyadic}, which the proofs below rely on); the opening of the
% following lemma has been reconstructed from the surviving fragment.
\begin{lemma} Let $x$ be a dyadic complex number, $m \in \Z_{>0}$, $m$ odd and $e \in \Z$. Then $x^{m 2^e}$ is a dyadic complex if and only if $x^{2^e}$ is. \end{lemma} \begin{proof} Notice that by Proposition~\ref {prop:dyadic} the set of dyadics forms a ring, whence any positive integral power of a dyadic is again dyadic. Thus if $x^{2^e}$ is dyadic, then so is $x^{m 2^e}$. Conversely, assume that $x^{m 2^e}$ is dyadic. If $e \geq 0$, then $x^{2^e}$ is dyadic independently of the assumption, and it remains to consider the case $e < 0$. Write $x = i^u \prod_{k \geq 0} p_k^{\alpha_k}$ and $z = x^{m 2^e} = i^v \prod_k p_k^{\beta_k}$, so that $x^m = z^{2^{|e|}}$. The uniqueness of the prime decomposition implies that $m \alpha_k = 2^{|e|} \beta_k$, and since $m$ is odd, $2^{|e|}$ must divide $\alpha_k$. Then $x^{2^e} = i^w \prod_k p_k^{\gamma_k}$ with $w \equiv m^{-1} v \pmod 4$ and $\gamma_k = \frac {\alpha_k}{2^{|e|}}$. Now $\alpha_k \geq 0$ for $k \geq 1$ implies $\gamma_k \geq 0$ for $k \geq 1$, and $x^{2^e}$ is dyadic by Proposition~\ref {prop:dyadic}. \end{proof} It remains to decide when $x^{2^e}$ is dyadic for $x$ dyadic. If $e \geq 0$, this is trivially the case. For $e < 0$, the question boils down to whether it is possible to take $|e|$ successive square roots of $x$; as soon as the process fails, it is clear that $x^{2^e}$ cannot be dyadic. \begin{lemma} \label {lm:sqrtrat} Let $x \in \Q (i)$, and write $x = (a + b i)^2$ with $a$, $b \in \R$. Then either both of $a$ and $b$ are rational, or none of them is. \end{lemma} \begin{proof} Assume that one of $a$ and $b$ is rational.
Then $\Im x = 2 a b \in \Q$ implies that also the other one is rational. \end{proof} \begin{lemma} Let $x$ be dyadic, and write $x = (a + b i)^2$ with $a$, $b \in \R$. Then either both of $a$ and $b$ are dyadic reals, or none of them is. \end{lemma} \begin{proof} Assume that one of $a$ and $b$ is a dyadic real, that is, a rational with a power of~$2$ as denominator. Then $a$, $b \in \Q$ by Lemma~\ref {lm:sqrtrat}. Now, $\Re x = a^2 - b^2$ implies that also the square of the \textit {a priori} not dyadic coefficient $a$ or $b$, and thus the coefficient itself, has as denominator a power of~$2$. \end{proof} \begin {theorem} Let $x = m 2^e$ and $y = n 2^f$ be dyadic complex numbers with $m$ and $n$ odd, and let $z = x^y$. Call the pair $(x, y)$ {\em exceptional} if at least one of $\Re z$ or $\Im z$ is a dyadic real. Exceptional pairs occur only in the following cases: \begin {enumerate} \item $y = 0$; then $z = 1$ \item $x \geq 0$ and $y \neq 0$ are real; then $\Im z = 0$, and the question whether $\Re z = x^y$ is dyadic involves only real numbers and can thus be delegated to \mpfr. \item $x < 0$ and $y \neq 0$ are real. \begin {enumerate} \item $y \in \Z$; then $\Im z = 0$, and $\Re z = x^y$ is dyadic if and only if $y > 0$, or $y < 0$ and $-m = 1$. \item $y \in \frac {1}{2} \Z \backslash \Z$, that is, $f = -1$; then $\Re z = 0$, and $\Im z = (-x)^y$ is dyadic if and only if $e$ is even, $-m$ is a square, and, in case $y < 0$, $-m = 1$. \item $y \in \frac {1}{4} \Z \backslash \frac {1}{2} \Z$, that is, $f = -2$; then $z = \frac {1 + i}{\sqrt 2} (-x)^y$ has both real and imaginary dyadic parts if and only if $e \equiv 2 \pmod 4$, $-m$ is a fourth power, and, in case $y < 0$, $-m = 1$. \end {enumerate} \item $y$ not real; see Conjecture \item $y > 0$ real, $x$ not real; see above \item $y < 0$ real, $x$ not real; still to do \end {enumerate} \end {theorem} \begin {proof} \begin {enumerate} \item Clear by definition. \item Clear. 
\item The first two subcases $f \geq -1$ follow from the observation that $x^y = (-1)^y (-x)^y$, where $(-1)^y \in \langle i \rangle$. For $y > 0$, the number $(-x)^y$ is dyadic if and only if $(-x)^{2^f}$ is, which leads to the result; for $y < 0$, one furthermore needs that $(-m)^{-1}$ is dyadic, which for $m$ odd is only possible if $-m = 1$. The third subcase $f = -2$ is similar, but one needs that $(-x)^y$ is dyadic up to a factor of $\sqrt 2$. We proceed to show that for $f \leq -3$, there is no exceptional pair. Suppose that $(x, y)$ is an exceptional pair; by switching to $\left( x^{|n|}, \frac {y}{|n|} \right)$, we may assume without loss of generality that $|n| = 1$. Then $x^y$ is obtained by taking $|f|$ successive square roots of either $x$ or $\frac {1}{x}$, both of which are elements of $\Q (i)$. Lemma~\ref {lm:sqrtrat} implies that both $\Re (x^y)$ and $\Im (x^y)$ are rational. Write $x^y = \alpha \zeta = \alpha \zeta_r + i \alpha \zeta_i$, where $\alpha = (-x)^y \in \R$ and $\zeta = \zeta_r + i \zeta_i$ is a primitive root of unity of order~$2^{|f| + 1}$. Then $\alpha \zeta_r$, $\alpha \zeta_i \in \Q$ implies $\zeta \in \Q (i, \alpha)$. Moreover, $\alpha^2 = \alpha^2 (\zeta_r^2 + \zeta_i^2) = (\alpha \zeta_r)^2 + (\alpha \zeta_i)^2 \in \Q (i)$, so that $\Q (i, \alpha)$ is an extension of degree at most~$4$ of $\Q$ containing $\Q (\zeta)$ and thus a primitive $16$-th root of unity, which is impossible. \item \item \item \end {enumerate} \end {proof} A relevant reference is \cite{BrDiJeLeMeMuReStTo09}, especially Section 4.5 which discusses complex floating-point numbers, and gives error bounds for multiplication, division and square root. \bibliographystyle{acm} \bibliography{algorithms} \end {document}