this repo has no description
1/* acosf.s -- acosf for standard math library.
2
3 Written by Eric Postpischil, July 2007.
4*/
5
6
7 .literal8
8
9// Define the points where our algorithm changes.
10nPoint: .double -.62000000476837158
11pPoint: .double +.62000000476837158
12
13// Define miscellaneous constants.
14nOne: .double -1
15pOne: .double +1
16HalfPi: .double 1.5707963267948966192313217
17Pi: .double 3.1415926535897932384626433
18
19// Define a coefficient for center polynomial (used for x in [-.62, +.62]).
20C2: .double -0.7009458946846486978642201e-1
21
22// Define coefficients for tail polynomial (used for x outside [-.62, +.62]).
23pT2: .double 0.1475254770791043897245664e-2
24nT2: .double -0.1475254770791043897245664e-2
25
26// Define coefficients for reciprocal-square-root refinement.
27S0: .double 4.999999720603234088021257968
28S1: .double -3.333333463718495862077843807
29
30
31 .const
32 .align 4
33
34/* Define some coefficients for center polynomial (used for x in [-.62,
35 +.62]). These are stored in pairs at aligned addresses for use in SIMD
36 instructions.
37*/
38C1: .double +1.5714407739207253912688773, -1.5375796594506042231041545
39C0: .double +1.2521077245762256716713145, +1.8992998261460658570880250
40
41/* Define some coefficients for positive tail polynomial (used for x in (+.62,
42 1)).
43*/
44T0: .double +14.7303701447222424478789710, +27.0927484663960477135094919
45T1: .double +2.7185038560132749878882294, -8.6073428141752546318770571
46
47
48// Rename the general registers (just to make it easier to keep track of them).
49#if defined __i386__
50 #define r0 %eax
51 #define r1 %ecx
52 #define r2 %edx
53 #define r3 %ebx
54 #define r4 %esp
55 #define r5 %ebp
56 #define r6 %esi
57 #define r7 %edi
58#elif defined __x86_64__
59 #define r0 %rax
60 #define r1 %rcx
61 #define r2 %rdx
62 #define r3 %rbx
63 #define r4 %rsp
64 #define r5 %rbp
65 #define r6 %rsi
66 #define r7 %rdi
67#else
68 #error "Unknown architecture."
69#endif
70
71
72 .text
73
74
75// Define various symbols.
76
77#define BaseP r0 // Base address for position-independent addressing.
78
79#define p %xmm0 // Must be in %xmm0 for return on x86_64.
80#define x %xmm1
81#define x1 %xmm2
82#define w x1
83#define pa %xmm3
84#define e %xmm4
85#define rss %xmm5
86
87#if defined __i386__
88
89 // Define location of argument x.
90 #define Argx 4(%esp)
91
92 // Define how to address data. BaseP must contain the address of label 0.
93 #define Address(label) label-0b(BaseP)
94
95#elif defined __x86_64__
96
97 // Define location of argument x.
98 #define Argx %xmm0
99
100 // Define how to address data.
101 #define Address(label) label(%rip)
102
103#endif
104
105
106/* float acosf(float x).
107
108 Notes:
109
110 Citations in parentheses below indicate the source of a requirement.
111
112 "C" stands for ISO/IEC 9899:TC2.
113
114 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
115 requirements since it defers to C and requires errno behavior only if
116 we choose to support it by arranging for "math_errhandling &
117 MATH_ERRNO" to be non-zero, which we do not.
118
119 Return value:
120
121 For arcosine of 1, return +0 (C F.9.1.1).
122
123 For 1 < |x| (including infinity), return NaN (C F.9.1.1).
124
125 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a
126 signalling NaN, we return the "same" NaN quieted.)
127
128 Otherwise:
129
130 If the rounding mode is round-to-nearest, return arccosine(x)
131 faithfully rounded.
132
133 Return a value in [0, pi] (C 7.12.4.1 3). Note that this prohibits
134 returning a correctly rounded value for acosf(-1), since pi rounded
135 to a float lies outside that interval.
136
137 Not implemented: In other rounding modes, return arccosine(x)
138 possibly with slightly worse error, not necessarily honoring the
139 rounding mode (Ali Sazegari narrowing C F.9 10).
140
141 Exceptions:
142
143 May or may not raise inexact, even if the result is exact (C F.9 8).
144
145 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
146 of C 4.2.1) or 1 < |x| (including infinity) (C F.9.1.1) but not if the
147 input is a quiet NaN (C F.9 11).
148
149 May not raise exceptions otherwise (C F.9 9).
150
151 Properties:
152
153 Monotonic, proven by exhaustive testing.
154
155 Exhaustive testing proved this routine returns faithfully rounded
156 results. Since the rsqrtss instruction is specified to return a value
157 in an interval, tests were performed using each possible result,
158 showing that a valid result will be obtained regardless of which
159 value rsqrtss provides.
160*/
161 .align 5
162#if !defined DevelopmentInstrumentation
163 // This is the regular name used in the deployed implementation.
164 .globl _acosf
165 _acosf:
166#else
167 // This is the name used for a special test version of the routine.
168 .globl _acosfInstrumented
169 _acosfInstrumented:
170#endif
171
172 cvtss2sd Argx, x // Convert x to double precision.
173
174 #if defined __i386__
175
176 // Get address of 0 in BaseP.
177 call 0f // Push program counter onto stack.
178 0:
179 pop BaseP // Get program counter.
180
181 #endif
182
183/* We use different algorithms for different parts of the domain. There
184 is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint,
185 and a "positive tail" from pPoint to +1. Here, we compare and branch
186 to the appropriate code.
187
188 There are also special cases: NaNs, x < -1, and 1 < x. These are weeded
189 out in PositiveTail or NegativeTail.
190*/
191
192 ucomisd Address(pPoint), x
193 ja PositiveTail
194
195 ucomisd Address(nPoint), x
196 jb NegativeTail
197
198
199/* Here we have nPoint <= x <= pPoint. This is handled with a simple
200 evaluation of a polynomial that approximates arccosine.
201
202 The polynomial has been arranged into the form:
203
204 pi/2 - x + x *
205 c2 * (x**4 + c01 * x**2 + c00)
206 * x**2 * (x**4 + c11 * x**2 * c10)).
207
208 The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11
209 are stored at C1. c2 is at C2.
210
211 The two quartic factors are evaluated in SIMD registers. For brevity, some
212 comments below describe only one element of a register. The other is
213 analagous.
214*/
215 movsd x, x1 // Save a copy of x for later.
216 mulsd x, x // Form x**2.
217 movapd Address(C1), p // Get first coefficient pair.
218 unpcklpd x, x // Duplicate in SIMD register.
219 addpd x, p // Form x**2 + c1.
220 mulpd x, p // Form x**4 + c1 * x**2.
221 movlpd Address(C2), x // Put c2 in low element, with x**2 in high.
222 mulsd x1, x // Multiply by x.
223 addpd Address(C0), p // Form x**4 + c1 * x**2 + c0.
224 mulpd x, p // Multiply by c2*x in low and x**2 in high.
225 movhlps p, pa // Get high element.
226 subsd Address(HalfPi), x1 // Form x - pi/2.
227 mulsd pa, p // Multiply two factors.
228 subsd x1, p // Form pi/2 - x + x * c2 * ...
229
230// Return the double-precision number currently in p.
231ReturnDoubleInp:
232 cvtsd2ss p, p // Convert result to single precision.
233
234// Return the single-precision number currently in p.
235ReturnSingleInp:
236
237 #if defined __i386__
238 movss p, Argx // Shuttle result through memory.
239 // This uses the argument area for scratch space, which is allowed.
240 flds Argx // Return input on floating-point stack.
241 #else
242 // On x86_64, the return value is now in p, which is %xmm0.
243 #endif
244
245 ret
246
247
248// Handle pPoint < x.
249PositiveTail:
250
251 movsd Address(pOne), w // Get +1 for math and comparison.
252 ucomisd w, x // Compare x to +1.
253 jae InputIsPositiveSpecial
254
255/* Here we have pPoint < x < 1. The algorithm here is inspired by the
256 identity arccosine(x) = 2 * arcsine(sqrt((1-x)/2)). Replacing arcsine with
257 an approximating polynomial would give an odd polynomial in sqrt((1-x)/2),
258 which is the same as sqrt(1-x) multiplied by some polynomial in x. So we
259 have:
260
261 arccosine(x) ~= sqrt(1-x) * t(x).
262
263 Unfortunately, the square-root instruction (rsqrtss) takes too long, so we
264 use the faster reciprocal-square-root-estimate instruction instead and
265 refine its estimate. Given an estimate e from the rsqrtss instruction, the
266 square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)). Let e2a
267 be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a). The leading
268 coefficient of s, cl, has been removed (by dividing s by it) and multiplied
269 into the polynomial p above. That leaves:
270
271 sqrt(1-x) / cl ~= e * a * s(e2a)/cl.
272
273 s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at
274 labels S1 and S0, above.
275
276 t(x) has been arranged into the form:
277
278 t2
279 * (x**2 + t01 * x + t00)
280 * (x**2 + t11 * x + t10).
281
282 The two quadratic factors are evaluated in SIMD registers. For brevity,
283 some comments below describe only one element of a register. The other is
284 analagous.
285
286 So, our job here is to evaluate:
287
288 a = 1-x.
289
290 e = estimate of 1/sqrt(a).
291
292 e2a = e*e*a.
293
294 acosf(x) ~= e * a
295 * (e2a**2 + s1 * e2a + s0)
296 * t2
297 * (x**2 + t01 * x + t00)
298 * (x**2 + t11 * x + t10).
299*/
300
301 subsd x, w // Form 1-x.
302 cvtsd2ss w, e // Convert to single precision for rsqrtss.
303 movapd Address(T1), p // Start asinf polynomial.
304 unpcklpd x, x // Duplicate x.
305 addpd x, p // Form x + t1.
306 #if !defined DevelopmentInstrumentation
307 // This is the regular code used in the deployed implementation.
308 rsqrtss e, e // Estimate 1/sqrt(1-x).
309 #else
310 /* This instruction uses an estimate of 1/sqrt(1-x) passed by the
311 caller rather than the rsqrtss instruction. This allows us to test
312 the implementation with all values that rsqrtss might return.
313 */
314 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1-x).
315 #endif
316 cvtss2sd e, e // Convert to double precision.
317 mulpd x, p // Form x**2 + t1*x.
318 mulsd e, w // Form e * (1-x).
319 mulsd w, e // Form e**2 * (1-x).
320 // "e" in comments refers to the initial estimate from rsqrtss.
321 movhpd Address(pT2), w // Copy coefficient into high element.
322 addpd Address(T0), p // Form x**2 + t1*x + t0.
323 movsd Address(S1), rss // Form s1.
324 addsd e, rss // Form e**2 * (1-x) + s1.
325 mulpd w, p // Form e * (1-x) * p(x), split.
326 mulsd e, rss // Form e2a**2 + s1 * e2a.
327 movhlps p, pa // Separate high element.
328 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0.
329 mulsd pa, p // Finish e * (1-x) * p(x).
330 mulsd rss, p // Form e * (1-x) / sqrt(1-x) * p(x).
331
332 jmp ReturnDoubleInp
333
334
335// Handle x < nPoint.
336NegativeTail:
337
338 movsd Address(pOne), w // Get +1 for math.
339 ucomisd Address(nOne), x // Compare x to -1.
340 jbe InputIsNegativeSpecial
341
342/* Here we have -1 < x < nPoint. We use the same algorithm as in PositiveTail
343 but adapted for -x.
344
345 For brevity, some comments below describe only one element of a register.
346 The other is analagous.
347
348 Our job here is to evaluate:
349
350 a = 1+x.
351
352 e = estimate of 1/sqrt(a).
353
354 e2a = e*e*a.
355
356 acosf(x) ~= pi - e * a
357 * (e2a**2 + s1 * e2a + s0)
358 * t2
359 * (x**2 - t01 * x + t00)
360 * (x**2 - t11 * x + t10).
361
362 Note that the signs of terms involving x are negated from those in
363 PositiveTail.
364
365 For convenience, the final two factors are negated:
366
367 * (-x**2 + t01 * x - t00)
368 * (-x**2 + t11 * x - t10).
369*/
370
371 addsd x, w // Form 1+x.
372 cvtsd2ss w, e // Convert to single precision for rsqrtss.
373 movapd Address(T1), p // Start asinf polynomial.
374 unpcklpd x, x // Duplicate x.
375 subpd x, p // Form -x + t1.
376 #if !defined DevelopmentInstrumentation
377 // This is the regular code used in the deployed implementation.
378 rsqrtss e, e // Estimate 1/sqrt(1+x).
379 #else
380 /* This instruction uses an estimate of 1/sqrt(1+x) passed by the
381 caller rather than the rsqrtss instruction. This allows us to test
382 the implementation with all values that rsqrtss might return.
383 */
384 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1+x).
385 #endif
386 cvtss2sd e, e // Convert to double precision.
387 mulpd x, p // Form -x**2 + t1*x.
388 mulsd e, w // Form e * (1+x).
389 mulsd w, e // Form e**2 * (1+x).
390 // "e" in comments refers to the initial estimate from rsqrtss.
391 movhpd Address(nT2), w // Copy coefficient into high element.
392 subpd Address(T0), p // Form -x**2 + t1*x - t0.
393 movsd Address(S1), rss // Form s1.
394 addsd e, rss // Form e**2 * (1+x) + s1.
395 mulpd w, p // Form e * (1+x) * p(x), split.
396 mulsd e, rss // Form e2a**2 + s1 * e2a.
397 movhlps p, pa // Separate high element.
398 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0.
399 mulsd pa, p // Finish e * (1+x) * p(x).
400 mulsd rss, p // Form e * (1+x) / sqrt(1+x) * p(x).
401 addsd Address(Pi), p // Form pi - 2*asin(sqrt((1+x)/2)).
402
403 jmp ReturnDoubleInp
404
405
406/* Here we handle inputs greater than or equal to one, including infinity,
407 but not including NaNs. The condition code must be set to indicate equal
408 (zero flag is one) iff the input is one.
409*/
410InputIsPositiveSpecial:
411 je InputIsPositiveOne
412 jmp SignalInvalid
413
414
415/* Here we handle inputs less than or equal to -1, including -infinity, and
416 NaNs. The condition code must be set as if by using ucomisd to compare
417 the input in the "source operand" to -1 in the "destination operand".
418*/
419InputIsNegativeSpecial:
420 jp InputIsNaN
421 je InputIsNegativeOne
422
423 // Continue into SignalInvalid.
424// jmp SignalInvalid
425
426
427/* Here we handle inputs outside the domain of the function. We subtract
428 infinity from itself to generate the invalid signal and return a NaN.
429*/
430
431 .literal4
432Infinity: .long 0x7f800000
433
434 .text
435SignalInvalid:
436 movss Address(Infinity), p
437 subss p, p // Generate invalid signal and NaN value.
438 jmp ReturnSingleInp
439
440
441/* Here we handle an input of +1 or -1. arccosine(-1) is pi, which increases
442 when rounded to single precision. However, we are required to return
443 results in [0, pi], so we return pi rounded down.
444*/
445
446 .literal8
447/* Define a value near pi that yields pi rounded down when converted to
448 single precision. This allows us to generate inexact and return the
449 desired value for acosf(-1).
450*/
451AlmostPi: .double 3.1415925
452
453 .text
454InputIsPositiveOne:
455 xorps p, p // Return zero.
456 jmp ReturnSingleInp
457
458InputIsNegativeOne:
459 cvtsd2ss Address(AlmostPi), p
460 jmp ReturnSingleInp
461
462
463InputIsNaN:
464
465 #if defined __i386__
466 flds Argx // Return result on floating-point stack.
467 #else
468 cvtsd2ss x, %xmm0 // Return in %xmm0.
469 /* We cannot just return the input, because we must quiet a
470 signalling NaN.
471 */
472 #endif
473
474 ret