src/libm/Source/Intel/sincostan.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / libm / Source / Intel / sincostan.c
at fixPythonPipStalling 1102 lines 36 kB view raw
wrap content
Lubos Dolezel Restructured source tree to prepare for merge with the "darling" repo 10y ago
f228ae16
   1/*	sincostan.c -- sin, cos, and tan for standard math library.
   2
   3	These are preliminary versions of sin, cos, and tan written to provide
   4	argument reduction using the correct value of pi, rather than pi rounded to
   5	66-bits as used by the Intel trigonometric instructions.  Versions to
   6	enhance accuracy and speed may follow.
   7*/
   8
   9
  10#include <math.h>
  11
  12
  13// Describe the destination floating-point type.
  14#define	SignificandBits	53
  15#define	ExponentBits	11
  16
  17
  18/*	Define a structure for accessing the internal components of a double.
  19
  20	s.exponent is the raw exponent field.
  21
  22	s.significand2 is the least two significant bits of the significand field.
  23
  24	The other fields are unused.
  25*/
  26typedef union
  27{
  28	double d;
  29	struct
  30	{
  31		#if defined __BIG_ENDIAN__
  32			unsigned int sign         : 1;
  33			unsigned int exponent     : ExponentBits;
  34			unsigned int significand0 : SignificandBits-1-32;
  35			unsigned int significand1 : 30;
  36			unsigned int significand2 :  2;
  37		#else	// defined __BIG_ENDIAN__
  38			unsigned int significand2 :  2;
  39			unsigned int significand1 : 30;
  40			unsigned int significand0 : SignificandBits-1-32;
  41			unsigned int exponent     : ExponentBits;
  42			unsigned int sign         : 1;
  43		#endif	// defined __BIG_ENDIAN__
  44	} s;
  45} Double;
  46
  47
  48/*	Each of the following numbers is a strict upper bound on some technique
  49	for calculating sin, cos, or tan.  That is, the technique is known to apply
  50	when |x| < bound, where x is the function argument.  (ReduceFull also
  51	requires that \x| be greater than some amount around 0x1p-622, to avoid
  52	underflow or loss of accuracy.)
  53
  54	If |x| < BoundDenormal, x is denormal or zero.
  55
  56	If |x| < BoundTrivialSin, x is the correctly rounded sine of x.
  57
  58	If |x| < BoundTrivialCos, 1 is the correctly rounded cosine of x.
  59
  60	If |x| < BoundTrivialTan, x is the correctly rounded tangent of x.
  61
  62	If |x| < BoundPolynomial, sinp(x) or cosp(x) may be used for sin(x) or
  63	cos(x), with no argument reduction.
  64
  65	If |x| < BoundMedium, ReduceMedium may be used to reduce x.
  66
  67	If |x| < BoundFull, ReduceFull may be used to reduce x.
  68
  69	Otherwise, x is an infinity or a NaN.
  70*/
  71#define	BoundDenormal	0x1p-1022
  72#define	BoundTrivialSin	0x1.7137449123ef7p-26
  73#define	BoundTrivialCos	0x1.6A09E667F3BCDp-27
  74#define	BoundTrivialTan	0x1.250BFE1B082F5p-26
  75#define	BoundPolynomial	0x3.243f6a8885a31p-2
  76#define	BoundMedium		0x1.000013be57a40p19
  77#define	BoundFull		INFINITY
  78
  79
  80/*	Below, we define methods to test whether a function argument, x, is within
  81	bounds suitable for a techique for implementing the function.  Several
  82	methods are defined, using natural floating-point comparisons or using
  83	integer comparisons on 32 or 64 bits of the floating-point representation.
  84	Implementations may choose whichever method is best for performance.
  85
  86	To use, first prepare the key used for checking bounds:
  87
  88		BoundKeyType xk = Key(x);
  89
  90	That is done once at the start of a routine.  Then, as many times as
  91	desired, bounds are checked:
  92
  93		if (Within(xk, bound))
  94			...
  95
  96	The "bound" must be one of the names defined above, such as BoundMedium.
  97
  98	If Within(xk, bound) evaluates to true, then |x| < bound.  The converse is
  99	not necessarily true; if Within(xk, bound) is false, |x| might be less than
 100	bound.  This is because some implementations of Within do not check all
 101	bits of xk and bound.  They are intended only as fast screens that get the
 102	majority of values.  However, if floor(log[2](|x|)) < floor(log[2](bound)),
 103	Within(xk, bound) is guaranteed to be true -- its evaluation includes at
 104	least all of the floating-point exponent.
 105*/
 106#if defined __i386__	// (Add any other desired architectures.)
 107
 108	/*	This method uses integer comparisons with the most significant 32 bits
 109		of the IEEE-754 representation of the floating-point number.
 110	*/
 111
 112	#include <inttypes.h>
 113
 114	typedef uint32_t BoundKeyType;
 115
 116	typedef union
 117	{
 118		double d;
 119		#if defined __BIG_ENDIAN__
 120			struct { uint32_t key, notkey; } s;
 121		#else
 122			struct { uint32_t notkey, key; } s;
 123		#endif
 124	} DoubleForBound;
 125
 126	// Prepare constants containing the bounds in our desired form.
 127	#define	DefineBound(bound)	\
 128		static const DoubleForBound Double##bound = { bound };
 129	DefineBound(BoundDenormal)
 130	DefineBound(BoundTrivialSin)
 131	DefineBound(BoundTrivialCos)
 132	DefineBound(BoundTrivialTan)
 133	DefineBound(BoundPolynomial)
 134	DefineBound(BoundMedium)
 135	DefineBound(BoundFull)
 136
 137	// Get the key for |x|.
 138	static BoundKeyType Key(double x)
 139	{
 140		const DoubleForBound X = { x };
 141		BoundKeyType t = X.s.key;
 142		return t & 0x7fffffff;
 143	}
 144
 145	#define	Within(xk, bound)	((xk) < Double##bound.s.key)
 146
 147#elif defined __x86_64__	// (Add any other desired architectures.)
 148
 149	/*	This method uses integer comparisons with all 64 bits of the IEEE-754
 150		representation of the floating-point number.
 151	*/
 152
 153	#include <inttypes.h>
 154
 155	typedef uint64_t BoundKeyType;
 156
 157	typedef union
 158	{
 159		double d;
 160		uint64_t key;
 161	} DoubleForBound;
 162
 163	// Prepare constants containing the bounds in our desired form.
 164	#define	DefineBound(bound)	\
 165		static const DoubleForBound Double##bound = { bound };
 166	DefineBound(BoundDenormal)
 167	DefineBound(BoundTrivialSin)
 168	DefineBound(BoundTrivialCos)
 169	DefineBound(BoundTrivialTan)
 170	DefineBound(BoundPolynomial)
 171	DefineBound(BoundMedium)
 172	DefineBound(BoundFull)
 173
 174	// Get the key for |x|.
 175	static BoundKeyType Key(double x)
 176	{
 177		const DoubleForBound X = { x };
 178		BoundKeyType t = X.key;
 179		return t & 0x7fffffffffffffff;
 180	}
 181
 182	#define	Within(xk, bound)	((xk) < Double##bound.key)
 183
 184#else
 185
 186	// If no other method is selected, use regular floating-point comparison.
 187
 188	typedef double BoundKeyType;
 189
 190	#define	DefineBound(bound)
 191
 192	// Get the key for |x|.
 193	static BoundKeyType Key(double x)
 194	{
 195		#if defined __i386__ || defined __x86_64__
 196			// On Intel, just clear the sign bit.
 197			double xk = -0.;
 198			__asm__("andnpd %[x], %[xk]" : [xk] "+x" (xk) : [x] "x" (x));
 199			return xk;
 200		#elif 1
 201			return x < 0 ? -x : +x;
 202		#else
 203			return fabs(x);
 204		#endif
 205	}
 206
 207	#define	Within(xk, bound)	((xk) < bound)
 208
 209#endif
 210
 211
 212/*	Several argument-reduction routines follow.  Each has this specification:
 213
 214		static void Reduce<Name>(double *xp, int *a, double x).
 215
 216	Input:
 217	
 218		x is a number to be reduced modulo pi/2.  Each routine has
 219		qualifications on what values it supports for x.  Generally, x must not
 220		be a NaN or infinity.
 221
 222	Output:
 223
 224		*xp is set to a residue of x modulo pi/2.  Mostly, -pi/4 <= x <= +pi/4,
 225		but *xp may be outside this interval by as much as 3.24128e-11.
 226
 227		*a is set to the arc of the circle x is in.  0 <= *a < 4.
 228
 229		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
 230		integer k.  This holds even if x is outside [-pi/4, pi/4]; *a matches
 231		*xp.
 232
 233	This means all the reduction routines are returning *xp in radians.
 234	ReduceFull goes through steps in which it has period-fraction units (0 to 1
 235	corresponding to 0 to pi/2 radians), so the restoration of radians is
 236	forced and introduces additional rounding error and execution time.  We
 237	might want to do something about that.
 238
 239	Since *xp is a single double, it is insufficient to represent the residue
 240	precisely enough to compute a faithfully rounded function.  For that, we
 241	will have to return extended precision, perhaps a long double or perhaps
 242	two doubles.
 243*/
 244
 245
 246/*	static void ReduceFull(double *xp, int *a, double x).
 247
 248	Input:
 249	
 250		x is a number to be reduced modulo pi/2.  This routine requires 1 <=
 251		|x|.  (The actual limit is something smaller, perhaps around 0x1p-622,
 252		but 1 suffices.)
 253
 254	Output:
 255
 256		*xp is set to x modulo pi/2.
 257
 258		*a is set to the arc of the circle x is in.  0 <= *a < 4.
 259
 260		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
 261		integer k.
 262
 263	This routine was adapted from the _sin function in double_fns.h (which
 264	implements vvsin).
 265*/
 266static void ReduceFull(double *xp, int *a, double x)
 267{
 268	// The rows are 8-bit shifts of 27-bit windows on 2/pi * 0x1p400.
 269	static const double TwoOverPiWithOffset[4][45] =
 270	{
 271		{
 272			+0x0.0000000p+0, +0x1.45f3070p+399, -0x1.1b1bbe8p+372,
 273			-0x1.6b01ec8p+345, +0x1.5f47d50p+318, -0x1.6447e48p+291,
 274			-0x1.3ad4ce0p+263, +0x1.e21c820p+237, +0x1.fe51640p+208,
 275			-0x1.5085110p+182, +0x1.586dca0p+154, -0x1.c8e2df0p+129,
 276			+0x1.374b800p+102, +0x1.924bbb0p+74, -0x1.f62e6e0p+48,
 277			+0x1.cfe1df0p+20, -0x1.38d3b58p-6, -0x1.63045e0p-34,
 278			+0x1.1afa980p-63, -0x1.44bb7b0p-88, -0x1.6638fe0p-116,
 279			+0x1.ad17e00p-142, -0x1.bec66e0p-168, -0x1.4e33e58p-195,
 280			+0x1.9cfa4e0p-223, +0x1.08bf178p-249, -0x1.036be40p-279,
 281			+0x1.8ffc4c0p-303, -0x1.0fd3000p-339, -0x1.fc04340p-358,
 282			-0x1.dce94c0p-385, +0x1.4da3ee0p-413, -0x1.64c0980p-439,
 283			-0x1.b069ec8p-465, -0x1.1617380p-493, -0x1.32c3400p-521,
 284			-0x1.5d28ae0p-548, +0x1.eeb1fb0p-574, -0x1.a0e8500p-604,
 285			+0x1.e839cf8p-627, +0x1.e294a48p-654, +0x1.d4d7f68p-681,
 286			+0x1.fb11f90p-708, -0x1.517bd50p-735, +0x1.9823800p-767
 287		},
 288		{
 289			+0x0.0000000p+0, +0x1.45f3000p+399, +0x1.b727240p+377,
 290			-0x1.f56b020p+353, +0x1.3abe900p+325, -0x1.5964480p+299,
 291			+0x1.b6c52b0p+271, +0x1.93c4390p+244, +0x1.07f9480p+214,
 292			-0x1.38a8428p+191, -0x1.0ea7920p+162, -0x1.b7238c0p+135,
 293			+0x1.09374b8p+110, +0x1.924c000p+74, -0x1.15f62e8p+56,
 294			+0x1.21cfe20p+28, -0x1.0a71a70p+1, -0x1.acb1820p-25,
 295			-0x1.77dca08p-52, -0x1.68a25d8p-79, -0x1.ec598e0p-106,
 296			-0x1.fb29740p-133, -0x1.037d8d0p-161, +0x1.1d63980p-188,
 297			+0x1.a99cfa0p-215, +0x1.3908bf0p-241, +0x1.77bf250p-269,
 298			+0x1.d8ffc80p-299, -0x1.a000880p-322, +0x1.6603fc0p-350,
 299			-0x1.a1dce90p-377, -0x1.2fac970p-403, -0x1.2593040p-433,
 300			+0x1.9e4f960p-457, +0x1.36e9e90p-485, -0x1.c099620p-512,
 301			+0x1.7fa8b60p-538, -0x1.5b08a70p-565, -0x1.41a0e80p-596,
 302			-0x1.30be300p-622, -0x1.821d6b8p-646, +0x1.25d4d80p-673,
 303			-0x1.2813b80p-702, -0x1.ca8be00p-730, +0x1.580cc10p-754
 304		},
 305		{
 306			+0x0.0000000p+0, +0x1.4600000p+399, -0x1.9f246c0p+386,
 307			-0x1.bbead60p+360, -0x1.ec54100p+329, -0x1.c159648p+307,
 308			+0x1.c0db628p+280, +0x1.5993c40p+252, +0x1.c820ff0p+225,
 309			+0x1.458eaf0p+198, +0x1.ebbc560p+172, +0x1.b7246e0p+144,
 310			+0x1.d2126f0p+117, -0x1.a3ff370p+91, +0x1.2eea0a0p+64,
 311			-0x1.736f180p+37, -0x1.e214e40p+8, +0x1.62534e8p-17,
 312			-0x1.177dc80p-48, -0x1.0568a28p-71, +0x1.1213a68p-98,
 313			-0x1.c7eca60p-127, +0x1.7df9040p-154, +0x1.cc8eb20p-179,
 314			-0x1.9f2b318p-206, -0x1.6c6f780p-237, +0x1.f8bbdf8p-260,
 315			+0x1.283b200p-288, -0x1.da00080p-318, -0x1.fa67f00p-344,
 316			-0x1.0d0ee80p-372, +0x1.6b414e0p-397, -0x1.7049650p-423,
 317			+0x1.fb3c9f0p-450, +0x1.6136ea0p-477, -0x1.7381320p-505,
 318			-0x1.8680578p-530, +0x1.aea4f78p-557, -0x1.38141a0p-584,
 319			-0x1.d098600p-613, +0x1.ce7de28p-638, +0x1.4a4baa0p-666,
 320			-0x1.404a050p-692, +0x1.1f8d5d0p-720, +0x1.0ac0680p-749
 321		},
 322		{
 323			+0x0.0000000p+0, +0x1.8000000p+399, -0x1.d067c90p+396,
 324			-0x1.b1bbeb0p+368, +0x1.4fe13b0p+341, -0x1.05c1598p+315,
 325			+0x1.bb81b70p+287, -0x1.d6a66c0p+260, -0x1.de37df0p+233,
 326			-0x1.ae9c400p+200, -0x1.4214438p+180, -0x1.4f246e0p+153,
 327			+0x1.b8e9090p+126, +0x1.ba5c010p+99, -0x1.b6d1160p+72,
 328			+0x1.3a32440p+43, -0x1.80f10a0p+17, -0x1.c69dac8p-9,
 329			-0x1.8c11780p-36, +0x1.1afa978p-63, -0x1.12edec8p-90,
 330			+0x1.338e050p-117, -0x1.4ba0818p-144, -0x1.f633718p-171,
 331			+0x1.8e60d50p-198, -0x1.8c16c70p-225, +0x1.17e2f00p-254,
 332			-0x1.036be28p-279, +0x1.ff89800p-308, -0x1.0fd3400p-339,
 333			+0x1.fde5e00p-365, +0x1.18b5a10p-388, -0x1.64b8248p-414,
 334			-0x1.9302618p-441, -0x1.834f648p-468, -0x1.6173820p-497,
 335			+0x1.9a797f8p-522, +0x1.45aea50p-549, -0x1.14e0500p-578,
 336			-0x1.a0e84c0p-604, -0x1.7c63040p-631, -0x1.d6b5b40p-658,
 337			-0x1.59404a0p-684, -0x1.3b81cc0p-714, +0x1.7421580p-738
 338		}
 339	};
 340
 341	Double X = { x };
 342
 343	/*	Set ec to the unbiased exponent minus 33.  Why 33?  I do not know.
 344		This was in the vvsin code in double_fns.h.
 345	*/
 346	int ec = X.s.exponent - (1023+33);
 347
 348	// Set k to ceiling(ec / 27) and m to residue.
 349	int k = (ec + 26) * (607*4) >> 16;
 350	int m = 27*k - ec;
 351
 352	// offset is used to select a row in the reduction table.  See below.
 353	int offset = m >> 3;
 354
 355	/*	The reduction table, TwoOverPiWithOffset, contains bits of 2/pi.
 356		First, all entries are scaled by 2**400 to avoid overflow/underflow
 357		issues.  Each entry contains 27 bits of 2/pi, except in the first two
 358		columns.  The first column contains zeroes because no reduction is done
 359		for numbers that are already small.  The entries in the second row
 360		start before (at a bit with higher value) the leading bit of 2/pi, so
 361		they contain some leading zeroes.  The sign bits form part of the 27
 362		bits represented.
 363
 364		Each row forms a contiguous string of bits of 2/pi.  That is, adding
 365		all the entries with sufficient precision yields a single bit string
 366		representing 2/pi.  The rows differ in their starting points; row 0
 367		begins with the leading bit of 2/pi, and rows 1, 2, and 3 begin 8, 16,
 368		and 24 bits before that.
 369	*/
 370
 371	/*	Scale x to avoid overflow in Dekker split.  This is compensated for
 372		in the entries in TwoOverPiWithOffset.
 373	*/
 374	x *= 0x1p-400;
 375
 376	/*	Use Dekker's algorithm to split x into 26 bits and 27 bits.  This
 377		requires round-to-nearest mode.
 378	*/
 379	double xDekker = x * (0x1p27 + 1);
 380	double x0 = xDekker - (xDekker - x);
 381	double x1 = x - x0;
 382
 383	// Get address of starting point in table.
 384	const double *p0 = &TwoOverPiWithOffset[offset][k];
 385
 386	// Get table entries.
 387	const double fp0 = p0[0];
 388	const double fp1 = p0[1];
 389	const double fp2 = p0[2];
 390	const double fp3 = p0[3];
 391
 392	// Get high bits of x * f, where f is the part of 2/pi we are using.
 393	const double f0 = x1 * fp0 + fp1 * x0;
 394
 395	double f = x1 * fp1 + fp2 * x0;
 396
 397	// Combine to do integer work.
 398	const double fi = f0 + f;
 399
 400	static const double IntegerBias = 0x1.8p52;
 401
 402	// Force the integer bits into a specific position.
 403	Double Fi = { fi + IntegerBias };
 404		/*	|fi| is less than 0x1p36, so fi + IntegerBias is well within
 405			[0x1p52, 0x1p53), so it has a known exponent, and the bits with
 406			weight 2 and 1 are the least significant bits in its significand.
 407
 408			We know |fi| is less than 0x1p36 because it is the sum of x1 * fp0,
 409			fp1 * x0, and something small in f.  x0 has the same exponent as
 410			x.  Say 2**e <= |x| < 2**(e+1).  (That is the original x, before we
 411			scaled it by 0x1p-400.)  The table entry we look up for fp0 has
 412			magnitude less than 2**(400+27-27*ceiling((e-33)/27)), and fp1 is
 413			less, bounded by 2**(400-27*ceiling((e-33)/27)).  Including the
 414			scaling in x, fp1 * x0 is less than 2**(e+1-400) *
 415			2**(400-27*ceiling((e-33)/27)) <= 2**34.  Similarly, fp0 * x1 is
 416			less than 2**34, so their sum is less than 2**35.
 417		*/
 418
 419	// Get the two least significant integer bits.
 420	*a = Fi.s.significand2;
 421
 422	double fint = Fi.d - IntegerBias;
 423
 424	const double fp4 = p0[4];
 425	const double fp5 = p0[5];
 426	const double fp6 = p0[6];
 427
 428	f = f0 - fint + f;
 429	f += x1 * fp2 + fp3 * x0;
 430	f += x1 * fp3 + fp4 * x0;
 431	f += x1 * fp4 + fp5 * x0;
 432	f += x1 * fp5 + fp6 * x0;
 433
 434	// Convert to radians by multiplying by pi/2.
 435	*xp = f * 0x3.243F6A8885A3p-1;
 436}
 437
 438
 439/*	static void ReduceMedium(double *xp, int *a, double x).
 440
 441	Input:
 442	
 443		x is a number to be reduced modulo pi/2.  This routine requires |x| <=
 444		X.  X is described below.
 445
 446	Output:
 447
 448		*xp is set to x modulo pi/2.
 449
 450		*a is set to the arc of the circle x is in.  0 <= *a < 4.
 451
 452		On output, x is approximately k * 2*pi + *a * pi/2 + *xp for some
 453		integer k.
 454
 455	Nomenclature:
 456
 457		p is the period, pi/2.
 458
 459		X is the maximum value x may have.  X is 0x1.000013be57a3fp19.
 460
 461		InversePeriod is 1/p, rounded to the nearest double, ties to even.
 462
 463		n is x * InversePeriod, rounded to the nearest double, ties to even,
 464		and then rounded to the nearest integer, ties to even.
 465
 466		N is the maximum value n may have.  N is X * InversePeriod, rounded to
 467		the nearest double, ties to even, and then rounded to the nearest
 468		integer, ties to even.  N is 333773.
 469
 470	Notes:
 471
 472		In comments in this routine, unquoted expressions are mathematical and
 473		quoted expressions are floating-point.  Thus, "a + b" refers to the
 474		floating-point operation of addition, including rounding.
 475
 476		"n * Period[0]" is exact for all |n| <= N.  It is inexact for n = N+1,
 477		so this is the source of the bound on x.  X is the greatest value for
 478		which |n| <= N.  (It is happenstance that Period[0] limits x; if the
 479		period were different, its bits might result in Period[1] causing the
 480		limit.  n has some other roles in the errors in this routine that might
 481		limit x when used with other periods or different precisions.)
 482
 483		The difference between InversePeriod and 1/p can cause n to differ from
 484		the ideal value when x is near a multiple of p.  The result is that
 485		instead of producing a result inside the target interval [-p/2, p/2], a
 486		result slightly outside the interval is produced.  The result may be as
 487		much as 3.24128e-11 outside the interval.  The polynomial and the
 488		polynomial evaluation must be satisfactory over this extended interval.
 489*/
 490static void ReduceMedium(double *xp, int *a, double x)
 491{
 492	static const double InversePeriod =
 493		2 / 0x3.243f6a8885a308d313198a2e03707344ap0;
 494
 495	/*	Period is an extended-precision representation of the reduction period.
 496		Each element except the last has the property that multiplication by
 497		any integer with magnitude at most N is exact.
 498	*/
 499	static const double Period[] = {
 500			+0x1.921FB54440000p-0,
 501			+0x1.68C234C4C0000p-39,
 502			+0x1.98A2E03707345p-77
 503		};
 504
 505	// Estimate x / p and round to nearest integer.
 506	double n = x * InversePeriod + 0x1.8p52 - 0x1.8p52;
 507
 508	// Record which arc of the circle x lies in.
 509	*a = (int) n & 3;
 510
 511	// This is exact, per design of Period.
 512	double np0 = n * Period[0];
 513
 514	// This is exact, per design of Period.
 515	double np1 = n * Period[1];
 516
 517	/*	The error here is at most 1/2 ULP(n * Period[2]), which is
 518		1/2 ULP(n * +0x1.98A2E03707345p-77) = ULP(n * +0x1.98A2E03707345p-78).
 519
 520		n <= N = 333773, so an absolute bound on that is ULP(333773 *
 521		+0x1.98A2E03707345p-78) = 0x1p-111, but we need a bound dependent on n
 522		to partition a proof below.
 523
 524		This is called error (a).
 525	*/
 526	double np2 = n * Period[2];
 527
 528	/*	Set x00 to x - n * Period[0].  np0 is very near x, so this is exact,
 529		per David Goldberg, _What Every Computer Scientist Should Know About
 530		Floating-Point Arithmetic_, Theorem 11.
 531
 532		The nomenclature used here is:
 533
 534			x0 is x - n * (Period[0]),
 535			x1 is x - n * (Period[0] + Period[1]), and
 536			x2 is x - n * (Period[0] + Period[1] + Period[2]).
 537
 538			x00 is the first and only "word" of an extended precision
 539			representation of x0.
 540
 541			x10 is the first "word" of x1; it has the most significant bits.
 542			x11 is the next "word" of x1; it has the following bits.
 543	*/
 544	double x00 = x - np0;
 545
 546	/*	Subtract n * Period[1] from x0 to produce x1.  These operations yield
 547		an exact result, per Knuth, _The Art of Computer Programming_ 2, second
 548		edition, page 221, section 4.2.2, Theorem C, in the sense that x10 +
 549		x11 will exactly equal x0 - n * Period[1].  The first statement puts
 550		the high bits in x10, and the second statement determines what was
 551		rounded away in x10 and puts it in x11.
 552
 553		Knuth assumes |np1| <= |x00|.  This is often the case, but perhaps
 554		cancellation has reduced the magnitude of x00.  Hoever, if |x00| <
 555		|np1|, this arithmetic is exact, because x00's least significant bit is
 556		at least as large as the ULP of Period[1].
 557	*/
 558	double x10 = x00 - np1;
 559	double x11 = x00 - x10 - np1;
 560
 561	/*	This comment demonstrates the rounding error in the following
 562		statement, "t = x11 - np2", is tiny relative to x10.
 563
 564		This is called error (b).
 565
 566		Let LSB(x) be the weight of the least significant bit set in a
 567		floating-point number x.  E.g, LSB(1.25) is 1/4, although ULP(1.25) is
 568		2**-52.
 569		
 570		Here, x10 and x11 are each multiples of LSB(Period[1]).  If |x10| <
 571		2**53 LSB(Period[1]), then x11 is zero, because all the bits of x10 +
 572		x11 fit into x10.  In that case, "x11 - np2" is exact.
 573
 574		Otherwise, 2**53 LSB(Period[1]) <= |x10|, so 2 LSB(Period[1]) <=
 575		ULP(x10).  LSB(Period[1]) = 0x1p-73, so 0x1p-72 <= ULP(x10).
 576
 577		By construction of x10 and x11, |x11| <= 1/2 ULP(x10).
 578		
 579		|np2| <= |"n * Period[2]"| <= |n * Period[2] * (1+2**-53)| = 333733 *
 580		0x1.98A2E03707345p-77 * (1+2**-53) < 0x1.0426p-58.
 581
 582		|"x11 - np2"| <= 2*max(|x11|, |np2|) <= 2*max(1/2 ULP(x10),
 583		0x1.0426p-58) = max(ULP(x10), 0x1.0426p-57).  The rounding error in
 584		"x11 - np2" is at most 2**-53 times this, so it is at most max(2**-53
 585		ULP(x10), 0x1p-110).  Since ULP(x10) is at least 0x1p-72, the rounding
 586		error is at most 0x1p-38 ULP(x10).
 587	*/
 588	double t = x11 - np2;
 589
 590	/*	Add t to x1 to produce x2.  As before, we could use Knuth's technique
 591		to produce an exact sum, x20 + x21 = x10 + t.  For now, we only return
 592		x20.  When we want extended precision, we could return x21 as well.
 593		(If so, be careful to show that |t| <= |x10|, as required for Knuth's
 594		technique, or otherwise show the arithmetic is sufficiently accurate.)
 595
 596		The difference between x20 and x10 + t is called error (c).  It is at
 597		most 1/2 ULP(x20).
 598	*/
 599	double x20 = x10 + t;
 600	// double x21 = x10 - x20 + t;
 601
 602	*xp = x20;
 603
 604	/*	Having subtracted n * (Period[0] + Period[1] + Period[2]) from x, we
 605		have an error caused by the difference between p and Period[0] +
 606		Period[1] + Period[2], which is less than 0x1.6fdb2p-131.  So the error
 607		is less than n * 0x1.6fdb2p-131.
 608	   
 609		n <= N = 333773, so an absolute bound on that is 333773 *
 610		0x1.6fdb2p-131 < 0x1.d45fp-113, but we need a bound dependent on n to
 611		partition a proof below.
 612
 613		This is called error (d).
 614
 615		Our reduction has four errors:
 616
 617			(a), which is at most ULP(n * +0x1.98A2E03707345p-78) <= 0x1p-111.
 618			(b), which is at most 0x1p-38 ULP(x10).
 619			(c), which is at most 1/2 ULP(x20).
 620			(d), which is at most n * 0x1.6fdb2p-131 < 0x1.d45fp-113.
 621
 622		From the discussion of (b), if x11 is not zero, then 0x1p-72 <=
 623		ULP(x10).  Then x10 dominates the result, because np2 is so small that
 624		subtracting it from x1 to produce x2 cannot reduce the exponent by more
 625		than one.  So the errors (a), (b), and (d) are tiny relative to x10,
 626		and the total error is very nearly (c), 1/2 ULP(x20).
 627
 628		However, if x11 is zero, then (b) is zero, and we only need to consider
 629		(a), (c), and (d).  We partition this into two cases.
 630
 631		Case 0:  |x| < 0x1p12.
 632
 633			The double-precision floating-point number closest to a multiple of
 634			pi/2 in that interval is 0x1.6c6cbc45dc8dep6 (according to Maple
 635			code from Muller, _Elementary Functions_), and it is about
 636			0x1.6d61b58c99c43p-60 away from a multiple of pi/2.
 637
 638			n is at most 0x1p12 * InversePi, rounded to a double, so n < 2608.
 639			Then:
 640
 641				(a) is at most ULP(2608 * +0x1.98A2E03707345p-78) = 0x1p-119.
 642				(b) is zero.
 643				(c) is at most 1/2 ULP(x20).
 644				(d) is at most 2608 * 0x1.6fdb2p-131 < 0x1p-119.
 645
 646			The final result might be as low as about 0x1.6d6p-60.  That has an
 647			ULP of 0x1p-113, so (a) and (d) are small relative to it, and the
 648			total error is nearly (c), 1/2 ULP(x20).
 649
 650		Case 1:  0x1p12 <= |x|.
 651
 652			We still have |x| <= 0x1.000013be57a3fp19.  The double-precision
 653			floating-point number closest to a multiple of pi/2 in that set is
 654			0x1.6c6cbc45dc8dep11 (Muller again), and it is about
 655			0x1.6d61b58c99c43p-55 away from a multiple of pi/2.
 656
 657			Then:
 658
 659				(a) is at most ULP(333773 * +0x1.98A2E03707345p-78) = 0x1p-111.
 660				(b) is zero.
 661				(c) is at most 1/2 ULP(x20).
 662				(d) is at most 333773 * 0x1.6fdb2p-131 < 0x1.d45fp-113.
 663
 664			The final result might be as low as about 0x1.6d6p-55.  That has an
 665			ULP of 0x1p-108, so (a) and (d) are small relative to it, and the
 666			total error is nearly (c), 1/2 ULP(x20).
 667
 668		Therefore, the total error is never much more than 1/2 ULP of the value
 669		returned in *xp.
 670	*/
 671}
 672
 673
 674/*	static double sinp(double r)
 675	
 676	Return sine(r) using only a polynomial, no reduction.
 677
 678	Input:
 679	
 680		r, with |r| < pi/4 + 3.24128e-11.
 681
 682	Output:
 683
 684		Approximately sine(r) is returned.
 685*/
 686static double sinp(double r)
 687{
 688	double rr = r * r;
 689
 690	/*	Derived from Cephes Math Library Release 2.8:  June, 2000.  Maple's
 691		infnorm routine says this polynomial is within .0688 ULP of
 692		sine(r) for |r| < pi/4 + 3.24128e-11.
 693	*/
 694	return  r + r * rr * ((((((
 695		+ 1.58962301576546568060E-10) * rr
 696		- 2.50507477628578072866E-8 ) * rr
 697		+ 2.75573136213857245213E-6 ) * rr
 698		- 1.98412698295895385996E-4 ) * rr
 699		+ 8.33333333332211858878E-3 ) * rr
 700		- 1.66666666666666307295E-1 );
 701}
 702
 703
 704/*	static double cosp(double r)
 705	
 706	Return cosine(r) using only a polynomial, no reduction.
 707
 708	Input:
 709	
 710		r, with |r| < pi/4 + 3.24128e-11.
 711
 712	Output:
 713
 714		Approximately cosine(r) is returned.
 715*/
 716static double cosp(double r)
 717{
 718	double rr = r * r;
 719
 720	/*	Derived from Cephes Math Library Release 2.8:  June, 2000.  Maple's
 721		infnorm routine says this polynomial is within .00709 ULP of
 722		cosine(r) for |r| < pi/4 + 3.24128e-11.
 723	*/
 724	return (((((((
 725		- 1.13585365213876817300E-11) * rr
 726		+ 2.08757008419747316778E-9 ) * rr
 727		- 2.75573141792967388112E-7 ) * rr
 728		+ 2.48015872888517045348E-5 ) * rr
 729		- 1.38888888888730564116E-3 ) * rr
 730		+ 4.16666666666665929218E-2 ) * rr
 731		-  .5                       ) * rr
 732		+ 1;
 733}
 734
 735
 736/*	double sin(double x).
 737
 738	Notes:
 739
 740		Citations in parentheses below indicate the source of a requirement.
 741
 742		"C" stands for ISO/IEC 9899:TC2.
 743
 744		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
 745		requirements since it defers to C and requires errno behavior only if
 746		we choose to support it by arranging for "math_errhandling &
 747		MATH_ERRNO" to be non-zero, which we do not.
 748
 749	Return value:
 750
 751		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.6).
 752
 753		For +/- infinity, return a NaN (C F.9.1.6).
 754
 755		For a NaN, return the same NaN (C F.9 11 and 13).
 756
 757		Otherwise:
 758
 759			If the rounding mode is round-to-nearest, return sine(x) within a
 760			few ULP.  The maximum error of this routine is not precisely
 761			known.  The maximum error of the reduction might be around 3 ULP,
 762			although this is partly a guess.  The polynomials have small
 763			errors.  The polynomial evaluation might have an error under 1
 764			ULP.  So the worst error for this routine might be under 4 ULP.
 765		  
 766			Not currently implemented:  In other rounding modes, return sine(x)
 767			possibly with slightly worse error, not necessarily honoring the
 768			rounding mode (Ali Sazegari narrowing C F.9 10).
 769
 770		All results are in [-1, 1].
 771
 772	Exceptions:
 773
 774		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
 775		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
 776		smallest normal, underflow may or may not be raised.  This is stricter
 777		than the older 754 standard.
 778
 779		May or may not raise inexact, even if the result is exact (C F.9 8).
 780
 781		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
 782		of C 4.2.1) or an infinity (C F.9.1.6) but not if the input is a quiet
 783		NaN (C F.9 11).
 784
 785		May not raise exceptions otherwise (C F.9 9).
 786
 787	Properties:
 788
 789		Desired to be monotonic.  Not yet proven!
 790*/
 791double sin(double x)
 792{
 793	// Get |x| in form used by the bound-check operations for a key.
 794	BoundKeyType xk = Key(x);
 795
 796	double r;
 797	int j0;
 798
 799	/*	Handle denormal numbers and zero here to generate underflow for
 800		denormals numbers and to avoid generating inexact for zero.
 801	*/
 802	if (Within(xk, BoundDenormal))
 803		return x * (1 - 0x1p-53);
 804
 805	/*	For small numbers, we can return x for sine(x).  Unfortunately,
 806		adding this check slows down the other cases, which may be more
 807		frequent.
 808	*/
 809	else if (Within(xk, BoundTrivialSin))
 810	{
 811		/*	Make t0 volatile to force compiler to fetch it at run-time
 812			rather than optimizing away the multiplication.
 813		*/
 814		static volatile const double t0 = 1/3.;
 815
 816		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
 817			twice, since it is volatile.
 818		*/
 819		const double t1 = t0;
 820
 821		/*	Perform a multiplication and pass it into an assembly construct to
 822			prevent the compiler from knowing we do not use the result and
 823			optimizing away the multiplication.
 824		*/
 825		__asm__("" : : "X" (t1*t1));
 826
 827		// Return the floating-point number nearest sine(x), which is x.
 828		return x;
 829	}
 830
 831	// If |x| is small, no reduction is necessary.
 832	else if (Within(xk, BoundPolynomial))
 833		r = x, j0 = 0;
 834		/*	The interval for this could be enlarged by examining the sine
 835			polynomial and figuring out how big r can get before the error is
 836			too large.
 837		*/
 838
 839	// If |x| is medium, we can use a fast reduction routine.
 840	else if (Within(xk, BoundMedium))
 841		ReduceMedium(&r, &j0, x);
 842
 843	// Otherwise, we need the full, slow reduction routine.
 844	else if (Within(xk, BoundFull))
 845		ReduceFull(&r, &j0, x);
 846
 847	else
 848		return x-x;
 849
 850	switch (j0)
 851	{
 852		default:
 853		case 0:	return +sinp(r);
 854		case 1:	return +cosp(r);
 855		case 2:	return -sinp(r);
 856		case 3:	return -cosp(r);
 857	}
 858}
 859
 860
 861/*	double cos(double x).
 862
 863	Notes:
 864
 865		Citations in parentheses below indicate the source of a requirement.
 866
 867		"C" stands for ISO/IEC 9899:TC2.
 868
 869		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
 870		requirements since it defers to C and requires errno behavior only if
 871		we choose to support it by arranging for "math_errhandling &
 872		MATH_ERRNO" to be non-zero, which we do not.
 873
 874	Return value:
 875
 876		For +/- infinity, return a NaN (C F.9.1.6).
 877
 878		For a NaN, return the same NaN (C F.9 11 and 13).
 879
 880		Otherwise:
 881
 882			If the rounding mode is round-to-nearest, return cosine(x) within a
 883			few ULP.  The maximum error of this routine is not precisely
 884			known.  The maximum error of the reduction might be around 3 ULP,
 885			although this is partly a guess.  The polynomials have small
 886			errors.  The polynomial evaluation might have an error under 1
 887			ULP.  So the worst error for this routine might be under 4 ULP.
 888		  
 889			Not currently implemented:  In other rounding modes, return
 890			cosine(x) possibly with slightly worse error, not necessarily
 891			honoring the rounding mode (Ali Sazegari narrowing C F.9 10).
 892
 893		All results are in [-1, 1].
 894
 895	Exceptions:
 896
 897		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
 898		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
 899		smallest normal, underflow may or may not be raised.  This is stricter
 900		than the older 754 standard.
 901
 902		May or may not raise inexact, even if the result is exact (C F.9 8).
 903
 904		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
 905		of C 4.2.1) or an infinity (C F.9.1.5) but not if the input is a quiet
 906		NaN (C F.9 11).
 907
 908		May not raise exceptions otherwise (C F.9 9).
 909
 910	Properties:
 911
 912		Desired to be monotonic.  Not yet proven!
 913*/
 914double cos(double x)
 915{
 916	// Get |x| in form used by the bound-check operations for a key.
 917	BoundKeyType xk = Key(x);
 918
 919	double r;
 920	int j0;
 921
 922	// Avoid generating inexact for zero.
 923	if (x == 0)
 924		return 1;
 925
 926	/*	For small numbers, we can return 1 for cosine(x).  Unfortunately,
 927		adding this check slows down the other cases, which may be more
 928		frequent.  It greatly speeds up cases where |x| < 0x1p-492, by
 929		avoiding arithmetic with denormals.
 930	*/
 931	else if (Within(xk, BoundTrivialCos))
 932	{
 933		/*	Make t0 volatile to force compiler to fetch it at run-time
 934			rather than optimizing away the multiplication.
 935		*/
 936		static volatile const double t0 = 1/3.;
 937
 938		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
 939			twice, since it is volatile.
 940		*/
 941		const double t1 = t0;
 942
 943		/*	Perform a multiplication and pass it into an assembly construct to
 944			prevent the compiler from knowing we do not use the result and
 945			optimizing away the multiplication.
 946		*/
 947		__asm__("" : : "X" (t1*t1));
 948
 949		// Return the floating-point number nearest cosine(x), which is 1.
 950		return 1;
 951	}
 952
 953	// If |x| is small, no reduction is necessary.
 954	else if (Within(xk, BoundPolynomial))
 955		r = x, j0 = 0;
 956		/*	The interval for this could be enlarged by examining the cosine
 957			polynomial and figuring out how big r can get before the error is
 958			too large.
 959		*/
 960
 961	// If |x| is medium, we can use a fast reduction routine.
 962	else if (Within(xk, BoundMedium))
 963		ReduceMedium(&r, &j0, x);
 964
 965	// Otherwise, we need the full, slow reduction routine.
 966	else if (Within(xk, BoundFull))
 967		ReduceFull(&r, &j0, x);
 968
 969	else
 970		return x-x;
 971
 972	switch (j0)
 973	{
 974		default:
 975		case 0:	return +cosp(r);
 976		case 1:	return -sinp(r);
 977		case 2:	return -cosp(r);
 978		case 3:	return +sinp(r);
 979	}
 980}
 981
 982
 983/*	double tan(double x).
 984
 985	Notes:
 986
 987		Citations in parentheses below indicate the source of a requirement.
 988
 989		"C" stands for ISO/IEC 9899:TC2.
 990
 991		The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
 992		requirements since it defers to C and requires errno behavior only if
 993		we choose to support it by arranging for "math_errhandling &
 994		MATH_ERRNO" to be non-zero, which we do not.
 995
 996	Return value:
 997
 998		For +/- zero, return zero with same sign (C F.9 12 and F.9.1.7).
 999
1000		For +/- infinity, return a NaN (C F.9.1.7).
1001
1002		For a NaN, return the same NaN (C F.9 11 and 13).
1003
1004		Otherwise:
1005
1006			If the rounding mode is round-to-nearest, return tangent(x) within
1007			a few ULP.  The maximum error of this routine is not precisely
1008			known.  The maximum error of the reduction might be around 3 ULP,
1009			although this is partly a guess.  The polynomials have small
1010			errors.  The polynomial evaluation might have an error under 1
1011			ULP.  How the final division affects error has not been considered
1012			yet.  4.55 ULP has been observed.
1013		  
1014			Not currently implemented:  In other rounding modes, return
1015			tangent(x) possibly with slightly worse error, not necessarily
1016			honoring the rounding mode (Ali Sazegari narrowing C F.9 10).
1017
1018	Exceptions:
1019
1020		Raise underflow for a denormal result (C F.9 7 and Draft Standard for
1021		Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
1022		smallest normal, underflow may or may not be raised.  This is stricter
1023		than the older 754 standard.
1024
1025		May or may not raise inexact, even if the result is exact (C F.9 8).
1026
1027		Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
1028		of C 4.2.1) or an infinity (C F.9.1.7) but not if the input is a quiet
1029		NaN (C F.9 11).
1030
1031		May not raise exceptions otherwise (C F.9 9).
1032
1033	Properties:
1034
1035		Desired to be monotonic.  Not yet proven!
1036*/
1037double tan(double x)
1038{
1039	// Get |x| in form used by the bound-check operations for a key.
1040	BoundKeyType xk = Key(x);
1041
1042	double r;
1043	int j0;
1044
1045	/*	Handle denormal numbers and zero here to generate underflow for
1046		denormals numbers and to avoid generating inexact for zero.
1047	*/
1048	if (Within(xk, BoundDenormal))
1049		return x * (1 - 0x1p-53);
1050
1051	/*	For small numbers, we can return x for tangent(x).  Unfortunately,
1052		adding this check slows down the other cases, which may be more
1053		frequent.
1054	*/
1055	else if (Within(xk, BoundTrivialTan))
1056	{
1057		/*	Make t0 volatile to force compiler to fetch it at run-time
1058			rather than optimizing away the multiplication.
1059		*/
1060		static volatile const double t0 = 1/3.;
1061
1062		/*	Get t0 once.  If we wrote "t0*t0", the compiler would load it
1063			twice, since it is volatile.
1064		*/
1065		const double t1 = t0;
1066
1067		/*	Perform a multiplication and pass it into an assembly construct to
1068			prevent the compiler from knowing we do not use the result and
1069			optimizing it away.
1070		*/
1071		__asm__("" : : "X" (t1*t1));
1072
1073		// Return the floating-point number nearest tangent(x), which is x.
1074		return x;
1075	}
1076
1077	// If |x| is small, no reduction is necessary.
1078	else if (Within(xk, BoundPolynomial))
1079		r = x, j0 = 0;
1080		/*	The interval for this could be enlarged by examining the
1081			polynomials and figuring out how big r can get before the error is
1082			too large.
1083		*/
1084
1085	// If |x| is medium, we can use a fast reduction routine.
1086	else if (Within(xk, BoundMedium))
1087		ReduceMedium(&r, &j0, x);
1088
1089	// Otherwise, we need the full, slow reduction routine.
1090	else if (Within(xk, BoundFull))
1091		ReduceFull(&r, &j0, x);
1092
1093	else
1094		return x-x;
1095
1096	switch (j0)
1097	{
1098		default:
1099		case 0:	case 2: return + sinp(r) / cosp(r);
1100		case 1:	case 3: return - cosp(r) / sinp(r);
1101	}
1102}