s_fma.c - vx32 - Local 9vx git repository for patches.
(HTM) git clone git://r-36.net/vx32
(DIR) Log
(DIR) Files
(DIR) Refs
---
s_fma.c (5003B)
---
1 /*-
2 * Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <fenv.h>
28 #include <float.h>
29 #include <math.h>
30
31 /*
32 * Fused multiply-add: Compute x * y + z with a single rounding error.
33 *
34 * We use scaling to avoid overflow/underflow, along with the
35 * canonical precision-doubling technique adapted from:
36 *
37 * Dekker, T. A Floating-Point Technique for Extending the
38 * Available Precision. Numer. Math. 18, 224-242 (1971).
39 *
40 * This algorithm is sensitive to the rounding precision. FPUs such
41 * as the i387 must be set in double-precision mode if variables are
42 * to be stored in FP registers in order to avoid incorrect results.
43 * This is the default on FreeBSD, but not on many other systems.
44 *
45 * Tests on an Itanium 2 indicate that the hardware's FMA instruction
46 * is almost twice as fast as this implementation. The hardware
47 * instruction should be used on platforms that support it.
48 *
49 * XXX May incur an absolute error of 0x1p-1074 for subnormal results
50 * due to double rounding induced by the final scaling operation.
51 *
52 * XXX On machines supporting quad precision, we should use that, but
53 * see the caveat in s_fmaf.c.
54 */
55 double
56 fma(double x, double y, double z)
57 {
58 static const double split = 0x1p27 + 1.0;
59 double xs, ys, zs;
60 double c, cc, hx, hy, p, q, tx, ty;
61 double r, rr, s;
62 int oround;
63 int ex, ey, ez;
64 int spread;
65
66 if (x == 0.0 || y == 0.0)
67 return (z);
68 if (z == 0.0)
69 return (x * y);
70
71 /* Results of frexp() are undefined for these cases. */
72 if (!isfinite(x) || !isfinite(y) || !isfinite(z))
73 return (x * y + z);
74
75 xs = frexp(x, &ex);
76 ys = frexp(y, &ey);
77 zs = frexp(z, &ez);
78 oround = fegetround();
79 spread = ex + ey - ez;
80
81 /*
82 * If x * y and z are many orders of magnitude apart, the scaling
83 * will overflow, so we handle these cases specially. Rounding
84 * modes other than FE_TONEAREST are painful.
85 */
86 if (spread > DBL_MANT_DIG * 2) {
87 fenv_t env;
88 feraiseexcept(FE_INEXACT);
89 switch(oround) {
90 case FE_TONEAREST:
91 return (x * y);
92 case FE_TOWARDZERO:
93 if (x > 0.0 ^ y < 0.0 ^ z < 0.0)
94 return (x * y);
95 feholdexcept(&env);
96 r = x * y;
97 if (!fetestexcept(FE_INEXACT))
98 r = nextafter(r, 0);
99 feupdateenv(&env);
100 return (r);
101 case FE_DOWNWARD:
102 if (z > 0.0)
103 return (x * y);
104 feholdexcept(&env);
105 r = x * y;
106 if (!fetestexcept(FE_INEXACT))
107 r = nextafter(r, -INFINITY);
108 feupdateenv(&env);
109 return (r);
110 default: /* FE_UPWARD */
111 if (z < 0.0)
112 return (x * y);
113 feholdexcept(&env);
114 r = x * y;
115 if (!fetestexcept(FE_INEXACT))
116 r = nextafter(r, INFINITY);
117 feupdateenv(&env);
118 return (r);
119 }
120 }
121 if (spread < -DBL_MANT_DIG) {
122 feraiseexcept(FE_INEXACT);
123 if (!isnormal(z))
124 feraiseexcept(FE_UNDERFLOW);
125 switch (oround) {
126 case FE_TONEAREST:
127 return (z);
128 case FE_TOWARDZERO:
129 if (x > 0.0 ^ y < 0.0 ^ z < 0.0)
130 return (z);
131 else
132 return (nextafter(z, 0));
133 case FE_DOWNWARD:
134 if (x > 0.0 ^ y < 0.0)
135 return (z);
136 else
137 return (nextafter(z, -INFINITY));
138 default: /* FE_UPWARD */
139 if (x > 0.0 ^ y < 0.0)
140 return (nextafter(z, INFINITY));
141 else
142 return (z);
143 }
144 }
145
146 /*
147 * Use Dekker's algorithm to perform the multiplication and
148 * subsequent addition in twice the machine precision.
149 * Arrange so that x * y = c + cc, and x * y + z = r + rr.
150 */
151 fesetround(FE_TONEAREST);
152
153 p = xs * split;
154 hx = xs - p;
155 hx += p;
156 tx = xs - hx;
157
158 p = ys * split;
159 hy = ys - p;
160 hy += p;
161 ty = ys - hy;
162
163 p = hx * hy;
164 q = hx * ty + tx * hy;
165 c = p + q;
166 cc = p - c + q + tx * ty;
167
168 zs = ldexp(zs, -spread);
169 r = c + zs;
170 s = r - c;
171 rr = (c - (r - s)) + (zs - s) + cc;
172
173 fesetround(oround);
174 return (ldexp(r + rr, ex + ey));
175 }
176