this repo has no description
1/* asinf.s -- asinf for standard math library.
2
3 Written by Eric Postpischil, June 2007.
4*/
5
6
7 .literal8
8
9// Define the points where our algorithm changes.
10nPoint: .double -.57
11pPoint: .double +.57
12
13// Define miscellaneous constants.
14nOne: .double -1
15pOne: .double +1
16HalfPi: .double 1.5707963267948966192313217
17
18// Define a coefficient for center polynomial (used for x in [-.57, +.57]).
19C2: .double 0.5175137818992348748214025e-1
20
21// Define coefficients for tail polynomials (used for x outside [-.57, +.57]).
22pT2: .double -0.1651550997063622696638513e-2
23nT2: .double +0.1651550997063622696638513e-2
24
25// Define coefficients for reciprocal-square-root refinement.
26S0: .double 4.999999720603234088021257968
27S1: .double -3.333333463718495862077843807
28
29
30 .const
31 .align 4
32
33/* Define some coefficients for center polynomial (used for x in [-.57,
34 +.57]). These are stored in pairs at aligned addresses for use in SIMD
35 instructions.
36*/
37C1: .double 1.8337275054369541825679091, -1.4826841272512669686823077
38C0: .double 1.5667812996782509912266781, 2.0555474653288563095606975
39
40// Define some coefficients for tail polynomial (used for x in (+.57, 1)).
41T0: .double +14.0375337010427139350610970, +25.3978891245023107522273720
42T1: .double +2.7174503090559850822332204, -8.3122365980471962999776635
43
44
45// Rename the general registers (just to make it easier to keep track of them).
46#if defined __i386__
47 #define r0 %eax
48 #define r1 %ecx
49 #define r2 %edx
50 #define r3 %ebx
51 #define r4 %esp
52 #define r5 %ebp
53 #define r6 %esi
54 #define r7 %edi
55#elif defined __x86_64__
56 #define r0 %rax
57 #define r1 %rcx
58 #define r2 %rdx
59 #define r3 %rbx
60 #define r4 %rsp
61 #define r5 %rbp
62 #define r6 %rsi
63 #define r7 %rdi
64#else
65 #error "Unknown architecture."
66#endif
67
68
69 .text
70
71
72// Define various symbols.
73
74#define BaseP r0 // Base address for position-independent addressing.
75
76#define p %xmm0 // Must be in %xmm0 for return on x86_64.
77#define x %xmm1
78#define x1 %xmm2
79#define w x1
80#define pa %xmm3
81#define e %xmm4
82#define rss %xmm5
83
84#if defined __i386__
85
86 // Define location of argument x.
87 #define Argx 4(%esp)
88
89 // Define how to address data. BaseP must contain the address of label 0.
90 #define Address(label) label-0b(BaseP)
91
92#elif defined __x86_64__
93
94 // Define location of argument x.
95 #define Argx %xmm0
96
97 // Define how to address data.
98 #define Address(label) label(%rip)
99
100#endif
101
102
103/* float asinf(float x).
104
105 Notes:
106
107 Citations in parentheses below indicate the source of a requirement.
108
109 "C" stands for ISO/IEC 9899:TC2.
110
111 The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
112 requirements since it defers to C and requires errno behavior only if
113 we choose to support it by arranging for "math_errhandling &
114 MATH_ERRNO" to be non-zero, which we do not.
115
116 Return value:
117
118 For arcsine of +/- zero, return zero with same sign (C F.9 12 and
119 F.9.1.2).
120
121 For 1 < |x| (including infinity), return NaN (C F.9.1.2).
122
123 For a NaN, return the same NaN (C F.9 11 and 13). (If the NaN is a
124 signalling NaN, we return the "same" NaN quieted.)
125
126 Otherwise:
127
128 If the rounding mode is round-to-nearest, return arcsine(x)
129 faithfully rounded.
130
131 Return a value in [-pi/2, +pi/2] (C 7.12.4.2 3). Note that this
132 prohibits returning correctly rounded values for asinf(-1) and
133 asinf(+1), since pi/2 rounded to a float lies outside that
134 interval.
135
136 Not implemented: In other rounding modes, return arcsine(x)
137 possibly with slightly worse error, not necessarily honoring the
138 rounding mode (Ali Sazegari narrowing C F.9 10).
139
140 Exceptions:
141
142 Raise underflow for a denormal result (C F.9 7 and Draft Standard for
143 Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the
144 smallest normal, underflow may or may not be raised. This is stricter
145 than the older 754 standard.
146
147 May or may not raise inexact, even if the result is exact (C F.9 8).
148
149 Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
150 of C 4.2.1) or 1 < |x| (including infinity) (C F.9.1.2) but not if the
151 input is a quiet NaN (C F.9 11).
152
153 May not raise exceptions otherwise (C F.9 9).
154
155 Properties:
156
157 Monotonic, proven by exhaustive testing.
158
159 Exhaustive testing proved this routine returns faithfully rounded
160 results. Since the rsqrtss instruction is specified to return a value
161 in an interval, tests were performed using each possible result,
162 showing that a valid result will be obtained regardless of which
163 value rsqrtss provides.
164*/
165 .align 5
166#if !defined DevelopmentInstrumentation
167 // This is the regular name used in the deployed implementation.
168 .globl _asinf
169 _asinf:
170#else
171 // This is the name used for a special test version of the routine.
172 .globl _asinfInstrumented
173 _asinfInstrumented:
174#endif
175
176 cvtss2sd Argx, x // Convert x to double precision.
177
178 #if defined __i386__
179
180 // Get address of 0 in BaseP.
181 call 0f // Push program counter onto stack.
182 0:
183 pop BaseP // Get program counter.
184
185 #endif
186
187/* We use different algorithms for different parts of the domain. There
188 is a "negative tail" from -1 to nPoint, a center from nPoint to pPoint,
189 and a "positive tail" from pPoint to +1. Here, we compare and branch
190 to the appropriate code.
191
192 There are also special cases: NaNs, x < -1, and 1 < x. These are weeded
193 out in PositiveTail or NegativeTail.
194*/
195
196 ucomisd Address(pPoint), x
197 ja PositiveTail
198
199 ucomisd Address(nPoint), x
200 jb NegativeTail
201
202
203/* Here we have nPoint <= x <= pPoint. This is handled with a simple
204 evaluation of a polynomial that approximates arcsine.
205
206 The polynomial has been arranged into the form:
207
208 x + x *
209 c2 * (x**4 + c01 * x**2 + c00)
210 * x**2 * (x**4 + c11 * x**2 * c10).
211
212 The coefficients c00 and c10 are stored in a pair at C0, and c01 and c11
213 are stored at C1. c2 is at C2.
214
215 The two quartic factors are evaluated in SIMD registers. For brevity, some
216 comments below describe only one element of a register. The other is
217 analagous.
218*/
219 movsd x, x1 // Save a copy of x for later.
220 mulsd x, x // Form x**2.
221 movapd Address(C1), p // Get first coefficient pair.
222 unpcklpd x, x // Duplicate in SIMD register.
223 addpd x, p // Form x**2 + c1.
224 mulpd x, p // Form x**4 + c1 * x**2.
225 movlpd Address(C2), x // Put c2 in low element, with x**2 in high.
226 mulsd x1, x // Multiply by x.
227 addpd Address(C0), p // Form x**4 + c1 * x**2 + c0.
228 mulpd x, p // Multiply by c2*x in low and x**2 in high.
229 movhlps p, pa // Get high element.
230 mulsd pa, p // Multiply two factors.
231 addsd x1, p // Add final term.
232
233// Return the double-precision number currently in p.
234ReturnDoubleInp:
235 cvtsd2ss p, p // Convert result to single precision.
236
237// Return the single-precision number currently in p.
238ReturnSingleInp:
239
240 #if defined __i386__
241 movss p, Argx // Shuttle result through memory.
242 // This uses the argument area for scratch space, which is allowed.
243 flds Argx // Return input on floating-point stack.
244 #else
245 // On x86_64, the return value is now in p, which is %xmm0.
246 #endif
247
248 ret
249
250
251// Handle pPoint < x.
252PositiveTail:
253
254 movsd Address(pOne), w // Get +1 for math and comparison.
255 ucomisd w, x // Compare x to +1.
256 jae InputIsPositiveSpecial
257
258/* Here we have pPoint < x < 1. The algorithm here is inspired by the
259 identity arcsine(x) = pi/2 - 2 * arcsine(sqrt((1-x)/2)). Replacing arcsine
260 with an approximating polynomial would give an odd polynomial in
261 sqrt((1-x)/2), which is the same as sqrt(1-x) multiplied by some polynomial
262 in x. So we have:
263
264 arcsine(x) ~= pi/2 + sqrt(1-x) * t(x).
265
266 Unfortunately, the square-root instruction (rsqrtss) takes too long, so we
267 use the faster reciprocal-square-root-estimate instruction instead and
268 refine its estimate. Given an estimate e from the rsqrtss instruction, the
269 square root of 1-x is very nearly, e * (1-x) * s(e**2 * (1-x)). Let e2a
270 be e**2 * (1-x), so sqrt(1-x) is nearly e * a * s(e2a). The leading
271 coefficient of s, cl, has been removed (by dividing s by it) and multiplied
272 into the polynomial p above. That leaves:
273
274 sqrt(1-x) / cl ~= e * a * s(e2a)/cl.
275
276 s(e2a)/cl is e2a**2 + s1 * e2a + s0, where s1 and s0 have been stored at
277 labels S1 and S0, above.
278
279 t(x) has been arranged into the form:
280
281 t2
282 * (x**2 + t01 * x + t00)
283 * (x**2 + t11 * x + t10).
284
285 The two quadratic factors are evaluated in SIMD registers. For brevity,
286 some comments below describe only one element of a register. The other is
287 analagous.
288
289 So, our job here is to evaluate:
290
291 a = 1-x.
292
293 e = estimate of 1/sqrt(a).
294
295 e2a = e*e*a.
296
297 asinf(x) ~= pi/2 + e * a
298 * (e2a**2 + s1 * e2a + s0)
299 * t2
300 * (x**2 + t01 * x + t00)
301 * (x**2 + t11 * x + t10).
302*/
303
304 subsd x, w // Form 1-x.
305 cvtsd2ss w, e // Convert to single precision for rsqrtss.
306 movapd Address(T1), p // Start asinf polynomial.
307 unpcklpd x, x // Duplicate x.
308 addpd x, p // Form x + t1.
309 #if !defined DevelopmentInstrumentation
310 // This is the regular code used in the deployed implementation.
311 rsqrtss e, e // Estimate 1/sqrt(1-x).
312 #else
313 /* This instruction uses an estimate of 1/sqrt(1-x) passed by the
314 caller rather than the rsqrtss instruction. This allows us to test
315 the implementation with all values that rsqrtss might return.
316 */
317 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1-x).
318 #endif
319 cvtss2sd e, e // Convert to double precision.
320 mulpd x, p // Form x**2 + t1*x.
321 mulsd e, w // Form e * (1-x).
322 mulsd w, e // Form e**2 * (1-x).
323 // "e" in comments refers to the initial estimate from rsqrtss.
324 movhpd Address(pT2), w // Copy coefficient into high element.
325 addpd Address(T0), p // Form x**2 + t1*x + t0.
326 movsd Address(S1), rss // Form s1.
327 mulpd w, p // Form e * (1-x) * p(x), split.
328 addsd e, rss // Form e**2 * (1-x) + s1.
329 movhlps p, pa // Separate high element.
330 mulsd pa, p // Finish e * (1-x) * p(x).
331 mulsd e, rss // Form e2a**2 + s1 * e2a.
332 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0.
333 mulsd rss, p // Form e * (1-x) / sqrt(1-x) * p(x).
334 addsd Address(HalfPi), p // Form pi/2 - 2*asin(sqrt((1-x)/2)).
335
336 jmp ReturnDoubleInp
337
338
339// Handle x < nPoint.
340NegativeTail:
341
342 movsd Address(pOne), w // Get +1 for math.
343 ucomisd Address(nOne), x // Compare x to -1.
344 jbe InputIsNegativeSpecial
345
346/* Here we have -1 < x < nPoint. We use the same algorithm as in PositiveTail
347 but adapted for -x.
348
349 For brevity, some comments below describe only one element of a register.
350 The other is analagous.
351
352 Our job here is to evaluate:
353
354 a = 1+x.
355
356 e = estimate of 1/sqrt(a).
357
358 e2a = e*e*a.
359
360 asinf(x) ~= -pi/2 - e * a
361 * (e2a**2 + s1 * e2a + s0)
362 * -t2
363 * (x**2 - t01 * x + t00)
364 * (x**2 - t11 * x + t10).
365
366 Note that the signs of terms involving x are negated from those in
367 PositiveTail, and the result is negated as well (by changing "pi/2" to
368 "-pi/2" and "t2" to "-t2").
369
370 For convenience, the final two factors are negated:
371
372 * (-x**2 + t01 * x - t00)
373 * (-x**2 + t11 * x - t10).
374*/
375
376 addsd x, w // Form 1+x.
377 cvtsd2ss w, e // Convert to single precision for rsqrtss.
378 movapd Address(T1), p // Start asinf polynomial.
379 unpcklpd x, x // Duplicate x.
380 subpd x, p // Form -x + t1.
381 #if !defined DevelopmentInstrumentation
382 // This is the regular code used in the deployed implementation.
383 rsqrtss e, e // Estimate 1/sqrt(1+x).
384 #else
385 /* This instruction uses an estimate of 1/sqrt(1+x) passed by the
386 caller rather than the rsqrtss instruction. This allows us to test
387 the implementation with all values that rsqrtss might return.
388 */
389 movss 8(%esp), e // Use caller's estimate of 1/sqrt(1+x).
390 #endif
391 cvtss2sd e, e // Convert to double precision.
392 mulpd x, p // Form -x**2 + t1*x.
393 mulsd e, w // Form e * (1+x).
394 mulsd w, e // Form e**2 * (1+x).
395 // "e" in comments refers to the initial estimate from rsqrtss.
396 movhpd Address(nT2), w // Copy coefficient into high element.
397 subpd Address(T0), p // Form -x**2 + t1*x - t0.
398 movsd Address(S1), rss // Form s1.
399 addsd e, rss // Form e**2 * (1+x) + s1.
400 mulpd w, p // Form e * (1+x) * p(x), split.
401 mulsd e, rss // Form e2a**2 + s1 * e2a.
402 movhlps p, pa // Separate high element.
403 addsd Address(S0), rss // Form e2a**2 + s1 * e2a + s0.
404 mulsd pa, p // Finish e * (1+x) * p(x).
405 mulsd rss, p // Form e * (1+x) / sqrt(1+x) * p(x).
406 subsd Address(HalfPi), p // Form -pi/2 - 2*asin(sqrt((1+x)/2)).
407
408 jmp ReturnDoubleInp
409
410
411/* Here we handle inputs greater than or equal to one, including infinity,
412 but not including NaNs. The condition code must be set to indicate equal
413 (zero flag is one) iff the input is one.
414*/
415InputIsPositiveSpecial:
416 je InputIsPositiveOne
417 jmp SignalInvalid
418
419
420/* Here we handle inputs less than or equal to -1, including -infinity, and
421 NaNs. The condition code must be set as if by using ucomisd to compare
422 the input in the "source operand" to -1 in the "destination operand".
423*/
424InputIsNegativeSpecial:
425 jp InputIsNaN
426 je InputIsNegativeOne
427
428 // Continue into SignalInvalid.
429// jmp SignalInvalid
430
431
432/* Here we handle inputs outside the domain of the function. We subtract
433 infinity from itself to generate the invalid signal and return a NaN.
434*/
435
436 .literal4
437Infinity: .long 0x7f800000
438
439 .text
440SignalInvalid:
441 movss Address(Infinity), p
442 subss p, p // Generate invalid signal and NaN value.
443 jmp ReturnSingleInp
444
445
446/* Here we handle an input of +1 or -1. arcsine(+1) is pi/2, which increases
447 when rounded to single precision. However, we are required to return
448 results in [-pi/2, pi/2], so we return pi/2 rounded toward zero.
449*/
450
451 .literal8
452/* Define values near +pi/2 and -pi/2 that yield +/- pi/2 rounded toward zero
453 when converted to single precision. This allows us to generate inexact and
454 return the desired values for asinf(+1) and asinf(-1).
455*/
456AlmostpHalfPi: .double +1.5707962
457AlmostnHalfPi: .double -1.5707962
458
459 .text
460InputIsPositiveOne:
461 cvtsd2ss Address(AlmostpHalfPi), p
462 jmp ReturnSingleInp
463
464InputIsNegativeOne:
465 cvtsd2ss Address(AlmostnHalfPi), p
466 jmp ReturnSingleInp
467
468
469InputIsNaN:
470
471 #if defined __i386__
472 flds Argx // Return result on floating-point stack.
473 #else
474 cvtsd2ss x, %xmm0 // Return in %xmm0.
475 /* We cannot just return the input, because we must quiet a
476 signalling NaN.
477 */
478 #endif
479
480 ret