From stephen@wilberforce.math.missouri.edu  Fri Jul 27 02:47:52 2012
Return-Path: <stephen@wilberforce.math.missouri.edu>
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id 94722106564A
	for <FreeBSD-gnats-submit@freebsd.org>; Fri, 27 Jul 2012 02:47:52 +0000 (UTC)
	(envelope-from stephen@wilberforce.math.missouri.edu)
Received: from wilberforce.math.missouri.edu (wilberforce.math.missouri.edu [128.206.184.213])
	by mx1.freebsd.org (Postfix) with ESMTP id 6C2B58FC0C
	for <FreeBSD-gnats-submit@freebsd.org>; Fri, 27 Jul 2012 02:47:52 +0000 (UTC)
Received: from wilberforce.math.missouri.edu (localhost [127.0.0.1])
	by wilberforce.math.missouri.edu (8.14.5/8.14.5) with ESMTP id q6R2lkBj021155
	for <FreeBSD-gnats-submit@freebsd.org>; Thu, 26 Jul 2012 21:47:46 -0500 (CDT)
	(envelope-from stephen@wilberforce.math.missouri.edu)
Received: (from stephen@localhost)
	by wilberforce.math.missouri.edu (8.14.5/8.14.5/Submit) id q6R2lkeR021134;
	Thu, 26 Jul 2012 21:47:46 -0500 (CDT)
	(envelope-from stephen)
Message-Id: <201207270247.q6R2lkeR021134@wilberforce.math.missouri.edu>
Date: Thu, 26 Jul 2012 21:47:46 -0500 (CDT)
From: Stephen Montgomery-Smith <stephen@freebsd.org>
Reply-To: Stephen Montgomery-Smith <stephen@freebsd.org>
To: FreeBSD-gnats-submit@freebsd.org
Cc:
Subject: complex arcsinh, log, etc.
X-Send-Pr-Version: 3.113
X-GNATS-Notify:

>Number:         170206
>Category:       bin
>Synopsis:       [msun] [patch] complex arcsinh, log, etc.
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-numerics
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:  
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Fri Jul 27 02:50:08 UTC 2012
>Closed-Date:    
>Last-Modified:  Mon Nov 05 20:03:02 UTC 2012
>Originator:     Stephen Montgomery-Smith
>Release:        FreeBSD 8.3-STABLE amd64
>Organization:
>Environment:
System: FreeBSD wilberforce 8.3-STABLE FreeBSD 8.3-STABLE #0: Tue Jun 12 20:29:45 CDT 2012 stephen@wilberforce:/usr/obj/usr/src/sys/GENERIC amd64


	
>Description:
	

Implement casin(h), cacos(h), catan(h), clog functions.

>How-To-Repeat:
	
>Fix:

	

These algorithms seem to have relative errors no worse than 4ULP for the arc-trig functions, and no worse than 5ULP for clog.


# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	cplex.c
#	catrig.c
#
echo x - cplex.c
sed 's/^X//' >cplex.c << 'ab710d5798014bb1e9398bb65fa5894f'
X#include <stdio.h>
X#include <complex.h>
X#include <float.h>
X#include <math.h>
X
X#include "math_private.h"
X
X/* round down to 18 = 54/3 bits */
Xstatic double trim(double x) {
X	uint32_t hi;
X
X	GET_HIGH_WORD(hi, x);
X	INSERT_WORDS(x, hi &0xfffffff8, 0);
X	return x;
X}
X
Xdouble complex
Xclog(double complex z)
X{
X	double x, y;
X	double ax, ay, x0, y0, x1, y1, x2, y2, t, hm1;
X	double val[12];
X	int i, sorted;
X
X	x = creal(z);
X	y = cimag(z);
X
X	/* Handle NaNs using the general formula to mix them right. */
X	if (x != x || y != y)
X		return (cpack(log(hypot(x, y)), atan2(y, x)));
X
X	ax = fabs(x);
X	ay = fabs(y);
X	if (ax < ay) {
X		t = ax;
X		ax = ay;
X		ay = t;
X	}
X
X	/*
X	 * To avoid unnecessary overflow, if x or y are very large, divide x
X	 * and y by M_E, and then add 1 to the logarithm.  This depends on
X	 * M_E being larger than sqrt(2).
X	 * There is a potential loss of accuracy caused by dividing by M_E,
X	 * but this case should happen extremely rarely.
X	 */
X	if (ay > 5e307)
X		return (cpack(log(hypot(x / M_E, y / M_E)) + 1, atan2(y, x)));
X
X	if (ax == 1) {
X		if (ay < 1e-150)
X			return (cpack((ay * 0.5) * ay, atan2(y, x)));
X		return (cpack(log1p(ay * ay) * 0.5, atan2(y, x)));
X	}
X
X	/*
X	 * Because atan2 and hypot conform to C99, this also covers all the
X	 * edge cases when x or y are 0 or infinite.
X	 */
X	if (ax < 1e-50 || ay < 1e-50 || ax > 1e50 || ay > 1e50)
X		return (cpack(log(hypot(x, y)), atan2(y, x)));
X
X	/* 
X	 * From this point on, we don't need to worry about underflow or
X	 * overflow in calculating ax*ax or ay*ay.
X	 */
X
X	/* Some easy cases. */
X
X	if (ax >= 1)
X		return (cpack(log1p((ax-1)*(ax+1) + ay*ay) * 0.5, atan2(y, x)));
X
X	if (ax*ax + ay*ay <= 0.7)
X		return (cpack(log(ax*ax + ay*ay) * 0.5, atan2(y, x)));
X
X	/*
X	 * Take extra care so that ULP of real part is small if hypot(x,y) is
X	 * moderately close to 1.
X	 */
X
X	x0 = trim(ax);
X	ax = ax-x0;
X	x1 = trim(ax);
X	x2 = ax-x1;
X	y0 = trim(ay);
X	ay = ay-y0;
X	y1 = trim(ay);
X	y2 = ay-y1;
X
X	val[0] = x0*x0;
X	val[1] = y0*y0;
X	val[2] = 2*x0*x1;
X	val[3] = 2*y0*y1;
X	val[4] = x1*x1;
X	val[5] = y1*y1;
X	val[6] = 2*x0*x2;
X	val[7] = 2*y0*y2;
X	val[8] = 2*x1*x2;
X	val[9] = 2*y1*y2;
X	val[10] = x2*x2;
X	val[11] = y2*y2;
X
X	/* Bubble sort. */
X	do {
X		sorted = 1;
X		for (i=0;i<11;i++) {
X			if (val[i] < val[i+1]) {
X				sorted = 0;
X				t = val[i];
X				val[i] = val[i+1];
X				val[i+1] = t;
X			}
X		}
X	} while (!sorted);
X
X	hm1 = -1;
X	for (i=0;i<12;i++) hm1 += val[i];
X	return (cpack(0.5 * log1p(hm1), atan2(y, x)));
X}
X
Xfloat complex
Xclogf(float complex z)
X{
X	return clog(z);
X}
X
ab710d5798014bb1e9398bb65fa5894f
echo x - catrig.c
sed 's/^X//' >catrig.c << 'e37ddaa44b334e25d827d6a69ee351aa'
X#include <complex.h>
X#include <float.h>
X#include <math.h>
X
X#include "math_private.h"
X
X/*
X * gcc doesn't implement complex multiplication or division correctly,
X * so we need to handle infinities specially. We turn on this pragma to
X * notify conforming c99 compilers that the fast-but-incorrect code that
X * gcc generates is acceptable, since the special cases have already been
X * handled.
X */
X#pragma	STDC CX_LIMITED_RANGE	ON
X
Xcomplex double clog(complex double z);
X
Xstatic const double
Xone =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
Xhuge=  1.00000000000000000000e+300;
X
X/*
X * Testing indicates that all these functions are accurate up to 4 ULP.
X */
X
X/*
X * The algorithm is very close to that in "Implementing the complex arcsine
X * and arccosine functions using exception handling" by T. E. Hull,
X * Thomas F. Fairgrieve, and Ping Tak Peter Tang, published in ACM
X * Transactions on Mathematical Software, Volume 23 Issue 3, 1997, Pages
X * 299-335, http://dl.acm.org/citation.cfm?id=275324
X *
X * casinh(x+iy) = sign(x)*log(A+sqrt(A*A-1)) + sign(y)*I*asin(B)
X * where
X * A = 0.5(|z+I| + |z-I|) = f(x,1+y) + f(x,1-y) + 1
X * B = 0.5(|z+I| - |z-I|)
X * z = x+I*y
X * f(x,y) = 0.5*(hypot(x,y)-y)
X * We also use
X * asin(B) = atan2(sqrt(A*A-y*y),y)
X * A-y = f(x,y+1) + f(x,y-1).
X *
X * Much of the difficulty comes because computing f(x,y) may produce
X * underflows.
X */
X
X/*
X * Returns 0.5*(hypot(x,y)-y).  It assumes x is positive, and that y does
X * not satisfy the inequalities 0 < fabs(y) < 1e-20.
X * If reporting the answer risks an underflow, the underflow flag is set,
X * and it returns 0.5*(hypot(x,y)-y)/x/x.
X */
Xstatic double f(double x, double y, int *underflow) {
X	if (x==0) {
X		*underflow = 0;
X		if (y > 0)
X			return 0;
X		return -y;
X	}
X	if (y==0) {
X		*underflow = 0;
X		return 0.5*x;
X	}
X	if (x < 1e-100 && x < y) {
X		*underflow = 1;
X		return 0.5/(hypot(x,y)+y);
X	}
X	if (x < y) {
X		*underflow = 0;
X		return 0.5*x*x/(hypot(x,y)+y);
X	}
X	*underflow = 0;
X	return 0.5*(hypot(x,y)-y);
X}
X
X/*
X * All the hard work is contained in this function.
X * Upon return:
X * rx = Re(casinh(x+I*y))
X * B_good is set to 1 if the value of B is usable.
X * If B_good is set to 0, A2my2 = A*A-y*y.
X */
Xstatic void do_hard_work(double x, double y, double *rx, int *B_good, double *B, double *A2my2)
X{
X	double R, S, A, fp, fm;
X	int fpuf, fmuf;
X
X	R = hypot(x,y+1);
X	S = hypot(x,y-1);
X	A = 0.5*(R + S);
X
X	if (A < 10) {
X		fp = f(x,1+y,&fpuf);
X		fm = f(x,1-y,&fmuf);
X		if (fpuf == 1 && fmuf == 1) {
X			if (huge+x>one) /* set inexact flag. */
X				*rx = log1p(x*sqrt((fp+fm)*(A+1)));
X		} else if (fmuf == 1) {
X			/* Overflow not possible because fp < 1e50 and x > 1e-100.
X			   Underflow not possible because either fm=0 or fm
X			   approximately bigger than 1e-200. */
X			if (huge+x>one) /* set inexact flag. */
X				*rx = log1p(fp+sqrt(x)*sqrt((fp/x+fm*x)*(A+1)));
X		} else if (fpuf == 1) {
X			/* Similar arguments against over/underflow. */
X			if (huge+x>one) /* set inexact flag. */
X				*rx = log1p(fm+sqrt(x)*sqrt((fm/x+fp*x)*(A+1)));
X		} else {
X			*rx = log1p(fp + fm + sqrt((fp+fm)*(A+1)));
X		}
X	} else
X		*rx = log(A + sqrt(A*A-1));
X
X	*B = y/A; /* = 0.5*(R - S) */
X	*B_good = 1;
X
X	if (*B > 0.5) {
X		*B_good = 0;
X		fp = f(x,y+1,&fpuf);
X		fm = f(x,y-1,&fmuf);
X		if (fpuf == 1 && fmuf == 1)
X			*A2my2 =x*sqrt((A+y)*(fp+fm));
X		else if (fmuf == 1)
X			/* Overflow not possible because fp < 1e50 and x > 1e-100.
X			   Underflow not possible because either fm=0 or fm
X			   approximately bigger than 1e-200. */
X			*A2my2 = sqrt(x)*sqrt((A+y)*(fp/x+fm*x));
X		else if (fpuf == 1)
X			/* Similar arguments against over/underflow. */
X			*A2my2 = sqrt(x)*sqrt((A+y)*(fm/x+fp*x));
X		else
X			*A2my2 = sqrt((A+y)*(fp+fm));
X	}
X}
X
Xdouble complex
Xcasinh(double complex z)
X{
X	double x, y, rx, ry, B, A2my2;
X	int sx, sy;
X	int B_good;
X
X	x = creal(z);
X	y = cimag(z);
X	sx = signbit(x);
X	sy = signbit(y);
X	x = fabs(x);
X	y = fabs(y);
X
X	if (cabs(z) > 1e20) {
X		if (huge+x>one) { /* set inexact flag. */
X			if (sx == 0) return clog(2*z);
X			if (sx == 1) return -clog(-2*z);
X		}
X	}
X
X	if (cabs(z) < 1e-20)
X		if (huge+x>one) /* set inexact flag. */
X			return z;
X
X	do_hard_work(x, y, &rx, &B_good, &B, &A2my2);
X	if (B_good)
X		ry = asin(B);
X	else
X		ry = atan2(y,A2my2);
X
X	if (sx == 1) rx = -rx;
X	if (sy == 1) ry = -ry;
X
X	return cpack(rx,ry);
X}
X
X/*
X * casin(z) = reverse(casinh(reverse(z)))
X * where reverse(x+I*y) = y+x*I = I*conj(x+I*y).
X */
X
Xdouble complex
Xcasin(double complex z)
X{
X	complex result;
X
X	result = casinh(cpack(cimag(z),creal(z)));
X	return cpack(cimag(result),creal(result));
X}
X
X/*
X * cacos(z) = PI/2 - casin(z)
X * but do the computation carefully so cacos(z) is accurate when z is
X * close to 1.
X */
X
Xdouble complex
Xcacos(double complex z)
X{
X	double x, y, rx, ry, B, A2my2;
X	int sx, sy;
X	int B_good;
X	complex w;
X
X	x = creal(z);
X	y = cimag(z);
X	sx = signbit(x);
X	sy = signbit(y);
X	x = fabs(x);
X	y = fabs(y);
X
X	if (cabs(z) > 1e20) {
X		if (huge+x>one) { /* set inexact flag. */
X			w = clog(2*z);
X			if (signbit(cimag(w)) == 0)
X				return cpack(cimag(w),-creal(w));
X			return cpack(-cimag(w),creal(w));
X		}
X	}
X
X	if (cabs(z) < 1e-10)
X		if (huge+x>one) /* set inexact flag. */
X			return cpack(M_PI_2-creal(z),-cimag(z));
X
X	do_hard_work(y, x, &ry, &B_good, &B, &A2my2);
X	if (B_good) {
X		if (sx==0)
X			rx = acos(B);
X		else
X			rx = acos(-B);
X	} else {
X		if (sx==0)
X			rx = atan2(A2my2,x);
X		else
X			rx = atan2(A2my2,-x);
X	}
X
X	if (sy==0) ry = -ry;
X
X	return cpack(rx,ry);
X}
X
X/*
X * cacosh(z) = I*cacos(z) or -I*cacos(z)
X * where the sign is chosen so Re(cacosh(z)) >= 0.
X */
X
Xdouble complex
Xcacosh(double complex z)
X{
X	complex double w;
X
X	w = cacos(z);
X	if (signbit(cimag(w)) == 0)
X		return cpack(cimag(w),-creal(w));
X	else
X		return cpack(-cimag(w),creal(w));
X}
X
X/* 
X * catanh(z) = 0.5 * log((z+1)/(z-1))
X *           = 0.25 * log(|z+1|^2/|z-1|^2) + 0.5 * I * atan2(2y, (1-x*x-y*y))
X */
X
Xdouble complex
Xcatanh(double complex z)
X{
X	double x, y, rx, ry, hp, hm;
X
X	x = creal(z);
X	y = cimag(z);
X
X	if (cabs(z) < 1e-20)
X		if (huge+x>one) /* set inexact flag. */
X			return z;
X
X	if (cabs(z) > 1e20)
X		if (huge+x>one) { /* set inexact flag. */
X			if (signbit(x) == 0)
X				return cpack(0,M_PI_2);
X			return cpack(0,-M_PI_2);
X	}
X
X	if (fabs(y) < 1e-100) {
X		if (huge+x>one) { /* set inexact flag. */
X			hp = (x+1)*(x+1);
X			hm = (x-1)*(x-1);
X		}
X	} else {
X		hp = (x+1)*(x+1)+y*y; /* |z+1|^2 */
X		hm = (x-1)*(x-1)+y*y; /* |z-1|^2 */
X	}
X
X	if (hp < 0.5 || hm < 0.5)
X		rx = 0.25*(log(hp/hm));
X	else if (x > 0)
X		rx = 0.25*log1p(4*x/hm);
X	else
X		rx = -0.25*log1p(-4*x/hp);
X
X	if (x==1 || x==-1) {
X		if (signbit(y) == 0)
X			ry = atan2(2, -y)/2;
X		else
X			ry = atan2(-2, y)/2;
X	} else if (fabs(y) < 1e-100) {
X		if (huge+x>one) /* set inexact flag. */
X			ry = atan2(2*y, (1-x)*(1+x))/2;
X	} else
X		ry = atan2(2*y, (1-x)*(1+x)-y*y)/2;
X
X	return cpack(rx,ry);
X}
X
X/*
X * catan(z) = reverse(catanh(reverse(z)))
X * where reverse(x+I*y) = y+x*I = I*conj(x+I*y).
X */
X
Xdouble complex
Xcatan(double complex z)
X{
X	complex result;
X
X	result = catanh(cpack(cimag(z),creal(z)));
X	return cpack(cimag(result),creal(result));
X}
e37ddaa44b334e25d827d6a69ee351aa
exit

>Release-Note:
>Audit-Trail:

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: bug-followup@FreeBSD.org, stephen@FreeBSD.org
Cc:  
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Thu, 26 Jul 2012 23:50:03 -0500

 Oops - error in the comments for catanh:
 
 catanh(z) = 0.5 * log((1+z)/(1-z))
 

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@freebsd.org>
Cc: FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 00:26:56 +1000 (EST)

 On Wed, 25 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > This function seems to be able to compute clog with a worst case relative 
 > error of 4 or 5 ULP.
 > ...
 
 I lost your previous reply about this after reading just the first part.
 Please resend if interested.
 
 First part recovered by vidcontrol:
 
 VC> > I'm still working on testing and fixing clog.  Haven't got near the more
 VC> > complex functions.
 VC> >
 VC> > For clog, the worst case that I've found so far has x^2+y^2-1 ~= 1e-47:
 VC> >
 VC> >      x = 0.999999999999999555910790149937383830547332763671875000000000
 VC> >      y =
 VC> > 0.0000000298023223876953091912775497878893005143652317201485857367516
 VC> >        (need high precision decimal or these rounded to 53 bits binary)
 VC> >      x^2+y^2-1 = 1.0947644252537633366591637369e-47
 VC> 
 VC> That is exactly 2^(-156).  So maybe triple quad precision really is enough.
 
 Hmm.  But you need 53 more value bits after the 156.  Quadruple precision
 gives 3 to spare.  I didn't notice that this number was exactly a power
 of 2, but just added 15-17 for the value bits in decimal to 47 to get over
 60.
 
 VC> > so it needs more than tripled double precision for a brute force
 VC> > evaluation, and the general case is probably worse.  I'm working
 VC> > on a rearrangement so that doubled double precision works in the
 VC> > general case.  Both your version and my version get this case right,
 VC> > but mess up different much easier cases.  It takes insanely great
 VC> > accuracy to get even 1 bit in this case right, but now that we
 
 Tripled double precision is enough for this because -1 cancels with
 leading terms, giving almost quadrupled double precision:
 
 % 	hm1 = -1;
 % 	for (i=0;i<12;i++) hm1 += val[i];
 % 	return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 
 It is the trailing terms that I think don't work right here.  You sort
 them and add from high to low, but normally it is necessary to add
 from low to high (consider terms [1, DBL_EPSILON/2, DBL_EPSILON/4]).
 Adding from high to low cancels with the -1 term, but then only
 particular values work right.  Also, I don't see how adding the low
 terms without extra precision preserves enough precision.
 
 Bruce

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: Stephen Montgomery-Smith <stephen@freebsd.org>,
        FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Fri, 27 Jul 2012 09:39:55 -0500

 On 07/27/2012 09:26 AM, Bruce Evans wrote:
 > On Wed, 25 Jul 2012, Stephen Montgomery-Smith wrote:
 >
 >> This function seems to be able to compute clog with a worst case
 >> relative error of 4 or 5 ULP.
 >> ...
 >
 > I lost your previous reply about this after reading just the first part.
 > Please resend if interested.
 >
 > First part recovered by vidcontrol:
 >
 > VC> > I'm still working on testing and fixing clog.  Haven't got near
 > the more
 > VC> > complex functions.
 > VC> >
 > VC> > For clog, the worst case that I've found so far has x^2+y^2-1 ~=
 > 1e-47:
 > VC> >
 > VC> >      x =
 > 0.999999999999999555910790149937383830547332763671875000000000
 > VC> >      y =
 > VC> > 0.0000000298023223876953091912775497878893005143652317201485857367516
 > VC> >        (need high precision decimal or these rounded to 53 bits
 > binary)
 > VC> >      x^2+y^2-1 = 1.0947644252537633366591637369e-47
 > VC> VC> That is exactly 2^(-156).  So maybe triple quad precision really
 > is enough.
 >
 > Hmm.  But you need 53 more value bits after the 156.  Quadruple precision
 > gives 3 to spare.  I didn't notice that this number was exactly a power
 > of 2, but just added 15-17 for the value bits in decimal to 47 to get over
 > 60.
 
 I think one should be able to prove mathematically that if the number is 
 as small as 1e-47, only the first one or two bits of the mantissa will 
 be non-zero.  I think that if more than triple double precision is 
 needed, it is only one or two more bits more than triple double precision.
 

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: Stephen Montgomery-Smith <stephen@freebsd.org>,
        FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Fri, 27 Jul 2012 09:45:02 -0500

 On 07/27/2012 09:26 AM, Bruce Evans wrote:
 
 > %     hm1 = -1;
 > %     for (i=0;i<12;i++) hm1 += val[i];
 > %     return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 >
 > It is the trailing terms that I think don't work right here.  You sort
 > them and add from high to low, but normally it is necessary to add
 > from low to high (consider terms [1, DBL_EPSILON/2, DBL_EPSILON/4]).
 > Adding from high to low cancels with the -1 term, but then only
 > particular values work right.  Also, I don't see how adding the low
 > terms without extra precision preserves enough precision.
 
 I understand what you are saying.  But in this case adding in order of 
 smallest to largest (adding -1 last) won't work.  If all the signs in 
 the same direction, it would work.  But -1 has the wrong sign.
 
 But I can also tell you that I haven't thought my algorithm through 
 every special case.  I can tell you it seems to work in all the examples 
 I tried.  But I don't have a mathematical proof.

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: Stephen Montgomery-Smith <stephen@freebsd.org>,
        FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Fri, 27 Jul 2012 15:50:14 -0500

 On 07/27/2012 09:26 AM, Bruce Evans wrote:
 
 > VC> > For clog, the worst case that I've found so far has x^2+y^2-1 ~=
 > 1e-47:
 > VC> >
 > VC> >      x =
 > 0.999999999999999555910790149937383830547332763671875000000000
 > VC> >      y =
 > VC> > 0.0000000298023223876953091912775497878893005143652317201485857367516
 > VC> >        (need high precision decimal or these rounded to 53 bits
 > binary)
 > VC> >      x^2+y^2-1 = 1.0947644252537633366591637369e-47
 > VC> VC> That is exactly 2^(-156).  So maybe triple quad precision really
 > is enough.
 
 Furthermore, if you use the computation (x-1)*(x+1)+y*y (assuming as you 
 do x>y>0), only double precision is necessary.  This is proved in the 
 paper "Implementing Complex Elementary Functions Using Exception 
 Handling" by Hull, Fairgrieve, Tang, ACM Transactions on Mathematical 
 Software, Vol 20, No 2, 1994.  They give a bound on the error, which I 
 think can be interpreted as being around 3.9 ULP.
 
 And I think you will see that your example does not contradict their 
 theorem.  Because in your example, x-1 will be rather small.
 
 So to get reasonable ULP (reasonable meaning 4 rather than 1), double 
 precision is all you need.

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@missouri.edu>
Cc: Bruce Evans <brde@optusnet.com.au>,
        Stephen Montgomery-Smith <stephen@FreeBSD.org>,
        FreeBSD-gnats-submit@FreeBSD.org, freebsd-bugs@FreeBSD.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 14:28:13 +1000 (EST)

 On Fri, 27 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > On 07/27/2012 09:26 AM, Bruce Evans wrote:
 >
 >> VC> > For clog, the worst case that I've found so far has x^2+y^2-1 ~=
 >> 1e-47:
 >> VC> >
 >> VC> >      x =
 >> 0.999999999999999555910790149937383830547332763671875000000000
 >> VC> >      y =
 >> VC> > 0.0000000298023223876953091912775497878893005143652317201485857367516
 >> VC> >        (need high precision decimal or these rounded to 53 bits
 >> binary)
 >> VC> >      x^2+y^2-1 = 1.0947644252537633366591637369e-47
 >> VC> VC> That is exactly 2^(-156).  So maybe triple quad precision really
 >> is enough.
 >
 > Furthermore, if you use the computation (x-1)*(x+1)+y*y (assuming as you do 
 > x>y>0), only double precision is necessary.  This is proved in the paper 
 > "Implementing Complex Elementary Functions Using Exception Handling" by Hull, 
 > Fairgrieve, Tang, ACM Transactions on Mathematical Software, Vol 20, No 2, 
 > 1994.  They give a bound on the error, which I think can be interpreted as 
 > being around 3.9 ULP.
 
 I'm using x*x-1+y*y in doubled precision, which I believe is better.
 
 I'm now thinking of the following refinement: suppose x is close to 1
 and y is close to 0 (other cases are easier to get right accidentally,
 but harder to analyze).  Then u = x-1 in non-doubled precision is exact
 and cancels most low bits.  So u*u is exact in non-doubled precision.
 Thus x*x-1 can be evaluated in mostly-non-doubled precision as u*u+2*u.
 2u is exact, and u*u is a tiny correction term (less than about half
 an ulp relative to 2*u).  If we just wanted to pass this result to
 log1p(), it would round to -2*u and no doubled precision would be
 necessary.  But we need to retain it to add to y*y.  The necessary
 extra precision is much easier for addition than for multiplication.
 If I'm right that u*u is less than half an ulp, then (-2*u, u*u) is
 already in my normal form for doubled precision.
 
 Oops, this is essentially the same as (x-1)*(x+1).  x-1 is u, and
 x+1 is u+2, so the product is u*u+2*u grouped in a numerically bad
 way (it either needs extra precision for x+1 and then for the
 multiplication, or loses accuracy starting with x+1).  Did you
 mean doubled precision, not double precision?
 
 This also avoids the following complication: double precision has an
 odd number of bits, so it can't be split in half for calculating x*x
 and y*y.  The usual 26+27 split would give an error of half an ulp in
 doubled doubled precision.  The above avoids this for x*x in some
 critical cases.  I hope in all critical cases.
 
 Cases where x*x and y*y are both nearly 0.5 have other interesting
 points.  If x*x >= 0.5, then x*x-1 is exact in doubled precision.  When
 x*x < 0.5, x*x-1 is not necessarily exact in doubled precision.  I
 handle these cases by writing the expression as (x*x-0.5)+(y*y-0.5).
 When x*x >= 0.5 and y*y >= 0.25, both methods give exact subtractions
 and I think they give the same result.
 
 > And I think you will see that your example does not contradict their theorem. 
 > Because in your example, x-1 will be rather small.
 >
 > So to get reasonable ULP (reasonable meaning 4 rather than 1), double 
 > precision is all you need.
 
 I think you mean doubled precision.
 
 4 in undoubled precision is mediocre, but 4 in doubled precision is
 many more than needed, but I hope I get 0.5+ epsilon.  The result
 starting with double precision would then be accurate with 53-4 or
 (54-0.5-epsilon) extra bits if the log1p() of it were taken with
 infinite precision.  But log1p() has finite precision, and I'm seeing
 the effects of slightly more than half a double precision bit being
 lost on conversion of the doubled double precision x*x+y*y-1 when
 passed to log1p(), and then another slightly more than half [...] lost
 by imperfect rounding of log1p().  So one of my tests is to remove the
 log1p() source of inaccuracy by replacing it with log1pl().  In float
 precision, exhaustive testing is possible though not complete; all
 cases tested with of |z| as close as possible to 1 were perfectly
 rounded.
 
 Bruce

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@missouri.edu>
Cc: Bruce Evans <brde@optusnet.com.au>,
        Stephen Montgomery-Smith <stephen@freebsd.org>,
        FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 15:25:04 +1000 (EST)

 On Fri, 27 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > On 07/27/2012 09:26 AM, Bruce Evans wrote:
 >
 >> %     hm1 = -1;
 >> %     for (i=0;i<12;i++) hm1 += val[i];
 >> %     return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 >> 
 >> It is the trailing terms that I think don't work right here.  You sort
 >> them and add from high to low, but normally it is necessary to add
 >> from low to high (consider terms [1, DBL_EPSILON/2, DBL_EPSILON/4]).
 >> Adding from high to low cancels with the -1 term, but then only
 >> particular values work right.  Also, I don't see how adding the low
 >> terms without extra precision preserves enough precision.
 >
 > I understand what you are saying.  But in this case adding in order of 
 > smallest to largest (adding -1 last) won't work.  If all the signs in the 
 > same direction, it would work.  But -1 has the wrong sign.
 
 No, even if all the signs are the same, adding from the highest to lowest
 can lose precision.  Normally at most 1 ulp, while cancelation can lose
 almost 2**MANT_DIG ulps.  Example:
 
 #define	DE	DBL_EPSILON		// for clarity
 
 (1)   1 + DE/2        = 1         (half way case rounded down to even)
 (2)   1 + DE/2 + DE/2 = 1         (double rounding)
 (3)   DE/2 + DE/2 + 1 = 1 + DE    (null rounding)
 
 We want to add -1 to a value near 1 like the above.  Now a leading 1
 in the above will cancel with the -1, and the the order in (3) becomes
 the inaccurate one.  Modify the above by shifting the epsilons and adding
 another 1 to get an example for our context:
 
 (2')  -1 + 1 + DE + DE*DE/2 + DE*DE/2 = DE
        (The leading +-1 are intentionally grouped and cancel.  The other
        terms are (2) multiplied by DE, and suffer the same double rounding.)
 (3')  -1 + 1 + DE*DE/2 + DE*DE/2 + DE = DE + DE*DE
        (The leading +-1 are intentionally grouped and cancel as before.
        The lower terms must be added from low to high, as in (3).)
 
 The right order is perhaps more transparently described as always from
 low to high, with suitable and explicit grouping of terms using
 parentheses to reduce cancelation errors.  But I don't like parentheses
 and prefer to depend on left to right evaluation.  With some parentheses,
 the above becomes:
 
 (3'') (DE**2/2 + DE**2/2 + DE) + (1 + -1)
        (Full parentheses for the left to right order would be unreadable,
        so although the order is critical, they shouldn't be used to
        emphasize it.)
 
 Here the cancelation is exact, but in general it gives a nonzero term
 which might need to be sorted into the other terms.  Strictly ordering
 all the terms is usually unnecessary and slow, and is usually not done.
 Neither is the analysis needed to prove that it is unnecessary.  Even
 the above examples (3'*) are sloppy about this.  They only work because
 the cancelation is exact.  In (3'), (-1 + 1) is added first.  That is
 correct since it is lowest (0).  In (3'') (1 + -1) is added last.  That
 is also correct, although the term is not the highest, since it is 0.
 Usually the cancelation won't be exact and gives a term that is far from
 lowest.  Assuming that it is still highest is the best sloppy sorting.
 
 Efficient evaluation of polynomials usually requires regrouping them
 in numerically dangerous ways.  I do some analysis for this.
 
 Bruce

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: Stephen Montgomery-Smith <stephen@FreeBSD.org>,
        FreeBSD-gnats-submit@FreeBSD.org, freebsd-bugs@FreeBSD.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 00:36:36 -0500

 This is a multi-part message in MIME format.
 --------------090903080003030709020200
 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
 Content-Transfer-Encoding: 7bit
 
 Yes, everywhere I said "double precision" I meant "doubled precision."
 
 I think the papers by Hull et al were perfectly happy with a ULP of 
 around 4.
 
 I have been trying to do a little better, but like you I am noticing 
 that log1p isn't that good either.
 
 I have tried some other things.  I am attaching this example which gets 
 a ULP a little over 2.  I simulate high precision arithmetic by 
 expanding everything out into integers.  I certainly didn't aim for a 
 speedy program.
 
 --------------090903080003030709020200
 Content-Type: text/x-csrc;
  name="cplex.c"
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment;
  filename="cplex.c"
 
 #include <stdio.h>
 #include <string.h>
 #include <complex.h>
 #include <float.h>
 #include <math.h>
 
 #include "math_private.h"
 
 /* Get binary digits -d through -d-16.  Assume x > 0 */
 uint32_t get_bits(double x, int d) {
 	uint32_t hi, lo;
 	int e;
 
 	if (x == 0) return 0;
 	e = d+ilogb(x)-4;
 	EXTRACT_WORDS(hi, lo, x);
 	hi &= 0x000fffff;
 	hi |= 0x00100000;
 	if (e <= -32) return 0;
 	if (e <= 0) {
 		hi >>= -e;
 		return hi & 0xffff;
 	}
 	if (e < 32) {
 		hi <<= e;
 		lo >>= (32-e);
 		return (hi | lo) & 0xffff;
 	}
 	if (e == 32)
 		return lo & 0xffff;
 	if (e <= 63) {
 		lo <<= (e-32);
 		return lo & 0xffff;
 	}
 	return 0;
 }
 
 #define NR_BLOCKS 8
 
 /*
  * clog(z) = log(hypot(x, y)) + I*atan2(y, x), taking extra care with
  * the real part when hypot(x, y) is moderately close to 1, where the
  * naive log(x*x + y*y)/2 suffers catastrophic cancellation.
  */
 double complex
 clog(double complex z)
 {
 	double x, y;
 	double ax, ay, t, hm1;
 	uint64_t xx[NR_BLOCKS+1], yy[NR_BLOCKS+1];
 	uint64_t zz[NR_BLOCKS+1];
 	uint64_t carry;
 	int sign;
 	int i, j;
 
 	x = creal(z);
 	y = cimag(z);
 
 	/* Handle NaNs using the general formula to mix them right. */
 	if (x != x || y != y)
 		return (cpack(log(hypot(x, y)), atan2(y, x)));
 
 	/* Order the magnitudes so that ax >= ay. */
 	ax = fabs(x);
 	ay = fabs(y);
 	if (ax < ay) {
 		t = ax;
 		ax = ay;
 		ay = t;
 	}
 
 	/*
 	 * To avoid unnecessary overflow, if x or y are very large, divide x
 	 * and y by M_E, and then add 1 to the logarithm.  This depends on
 	 * M_E being larger than sqrt(2).
 	 * There is a potential loss of accuracy caused by dividing by M_E,
 	 * but this case should happen extremely rarely.
 	 */
 	if (ay > 5e307)
 		return (cpack(log(hypot(x / M_E, y / M_E)) + 1, atan2(y, x)));
 
 	/*
 	 * When ax == 1 the real part reduces to log1p(ay*ay)/2, and for
 	 * tiny ay that is ay*ay/2 to full precision.
 	 */
 	if (ax == 1) {
 		if (ay < 1e-150)
 			return (cpack((ay * 0.5) * ay, atan2(y, x)));
 		return (cpack(log1p(ay * ay) * 0.5, atan2(y, x)));
 	}
 
 	/*
 	 * Because atan2 and hypot conform to C99, this also covers all the
 	 * edge cases when x or y are 0 or infinite.
 	 */
 	if (ax < 1e-50 || ay < 1e-50 || ax > 1e50 || ay > 1e50)
 		return (cpack(log(hypot(x, y)), atan2(y, x)));
 
 	/*
 	 * From this point on, we don't need to worry about underflow or
 	 * overflow in calculating ax*ax or ay*ay.
 	 */
 
 	/* Some easy cases: hypot(x,y) is far enough from 1. */
 
 	if (ax*ax + ay*ay <= 0.1 || ax*ax + ay*ay >= 10)
 		return (cpack(log(ax*ax + ay*ay) * 0.5, atan2(y, x)));
 
 	/*
 	 * Take extra care so that ULP of real part is small if hypot(x,y) is
 	 * moderately close to 1.  Compute ax*ax + ay*ay - 1 exactly by long
 	 * multiplication of the split mantissas in base 2^16.
 	 */
 
 	/* Split ax and ay into 16-bit blocks of a fixed-point form. */
 	for (i=-1; i<NR_BLOCKS; i++) {
 		xx[i+1] = get_bits(ax,16*i);
 		yy[i+1] = get_bits(ay,16*i);
 	}
 
 	/*
 	 * Long multiplication.  64-bit accumulators absorb the carries from
 	 * summing the partial 32-bit products.
 	 */
 	memset(zz,0,sizeof(zz));
 	for (i=-1; i<NR_BLOCKS; i++)
 		for (j=-1; j<NR_BLOCKS && i+j+1 < NR_BLOCKS; j++) {
 			zz[i+j+2] += xx[i+1]*xx[j+1];
 			zz[i+j+2] += yy[i+1]*yy[j+1];
 		}
 	/* Subtract 1 from the fixed-point result. */
 	zz[0]--;
 
 	/* Propagate the carries, leaving 16 bits per block. */
 	carry = 0;
 	for (i=NR_BLOCKS-1; i>=-1; i--) {
 		zz[i+1] += carry;
 		carry = zz[i+1] >> 16;
 		zz[i+1] &= 0xffff;
 	}
 
 	/*
 	 * If the result is negative, take the 1's complement.  (The 2's
 	 * complement would be exact, but the difference is negligible.)
 	 */
 	if ((zz[0] & 0x8000) != 0) {
 		sign = 1;
 		for (i=-1; i<NR_BLOCKS; i++)
 			zz[i+1] = 0xffff & (~zz[i+1]);
 	} else
 		sign = 0;
 
 	/*
 	 * Convert fixed point to floating point.  Accumulate from the least
 	 * significant block upward: summing low-to-high avoids the extra
 	 * rounding error that high-to-low accumulation can incur when the
 	 * small terms are absorbed by an already-large partial sum.
 	 */
 	hm1 = 0;
 	for (i=NR_BLOCKS-1; i>=-1; i--)
 		hm1 += zz[i+1] * exp2(16*(-1-i));
 
 	if (sign == 1) hm1 = -hm1;
 
 	return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 }
 
 /*
  * Float version: evaluate in double precision and let the result be
  * narrowed to float complex on return.
  */
 float complex
 clogf(float complex z)
 {
 	double complex w;
 
 	w = clog(z);
 	return (w);
 }
 
 
 --------------090903080003030709020200--

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: Stephen Montgomery-Smith <stephen@freebsd.org>,
        FreeBSD-gnats-submit@freebsd.org, freebsd-bugs@freebsd.org
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 00:44:04 -0500

 On 07/28/2012 12:25 AM, Bruce Evans wrote:
 > On Fri, 27 Jul 2012, Stephen Montgomery-Smith wrote:
 >
 >> On 07/27/2012 09:26 AM, Bruce Evans wrote:
 >>
 >>> %     hm1 = -1;
 >>> %     for (i=0;i<12;i++) hm1 += val[i];
 >>> %     return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 >>>
 >>> It is the trailing terms that I think don't work right here.  You sort
 >>> them and add from high to low, but normally it is necessary to add
 >>> from low to high (consider terms [1, DBL_EPSILON/2, DBL_EPSILON/4]).
 >>> Adding from high to low cancels with the -1 term, but then only
 >>> particular values work right.  Also, I don't see how adding the low
 >>> terms without extra precision preserves enough precision.
 >>
 >> I understand what you are saying.  But in this case adding in order of
 >> smallest to largest (adding -1 last) won't work.  If all the signs in
 >> the same direction, it would work.  But -1 has the wrong sign.
 >
 > No, even if all the signs are the same, adding from the highest to lowest
 > can lose precision.  Normally at most 1 ulp, while cancelation can lose
 > almost 2**MANT_DIG ulps.  Example:
 >
 > #define    DE    DBL_EPSILON        // for clarity
 >
 > (1)   1 + DE/2        = 1         (half way case rounded down to even)
 > (2)   1 + DE/2 + DE/2 = 1         (double rounding)
 > (3)   DE/2 + DE/2 + 1 = 1 + DE    (null rounding)
 >
 > We want to add -1 to a value near 1 like the above.  Now a leading 1
 > in the above will cancel with the -1, and the order in (3) becomes
 > the inaccurate one.
 
 Yes, but in my situation, I am rather sure that when I am adding highest 
 to lowest that this won't occur.  I am starting with -1, then adding 
 something close to 1, then adding lots of smaller terms.  And I find it 
 very plausible that the kind of situation you describe won't happen. 
 x0*x0 is close to 1.  x0*x1 is at most sqrt(DE) times smaller.  And so 
 on.  So I think the kind of situation you describe should never happen.
 
 As I said, I don't have a mathematical proof that the kind of thing you 
 describe can NEVER happen.  I just have never observed it happen.

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@missouri.edu>
Cc: Bruce Evans <brde@optusnet.com.au>, freebsd-bugs@freebsd.org,
        FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 17:35:51 +1000 (EST)

 On Sat, 28 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > On 07/28/2012 12:25 AM, Bruce Evans wrote:
 >> 
 >> #define    DE    DBL_EPSILON        // for clarity
 >> 
 >> (1)   1 + DE/2        = 1         (half way case rounded down to even)
 >> (2)   1 + DE/2 + DE/2 = 1         (double rounding)
 >> (3)   DE/2 + DE/2 + 1 = 1 + DE    (null rounding)
 >> 
 >> We want to add -1 to a value near 1 like the above.  Now a leading 1
 >> in the above will cancel with the -1, and the the order in (3) becomes
 >> the inaccurate one.
 >
 > Yes, but in my situation, I am rather sure that when I am adding highest to 
 > lowest that this won't occur.  I am starting with -1, then adding something 
 > close to 1, then adding lots of smaller terms.  And I find it very plausible 
 > that the kind of situation you describe won't happen. x0*x0 is close to 1. 
 > x0*x1 is at most sqrt(DE) times smaller.  And so on.  So I think the kind of 
 > situation you describe should never happen.
 
 Ahem.  FP^2 space is not nearly as large as the metaverse (only 2^256
 cases even for sparc64), but it is large enough so that almost
 everything that can happen in it does happen in it.  You are right
 that problems are far away with the x* terms (x0*x0 had better not be
 very close to 1 unless it is exactly 1, since it it is too close then
 it will no longer be many times larger than x0*x1 after subtracting 1
 from it; the other cases for x* are simpler).  The problem is with the
 additional y* terms.  x and y are independent, so for many or most x,
 there are many y's with bits that cause half-way cases when combined with x.
 After splitting and squaring, the bits move around, so it is hard to generate
 or control the offending y's.
 
 > As I said, I don't have a mathematical proof that the kind of thing you 
 > describe can NEVER happen.  I just have never observed it happen.
 
 There might be a measly 2^128 bad cases out of 2^256.  Then no one would
 even observe them by chance :-).  But half-way cases are fairly common.
 
 Bruce

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 10:40:41 -0500

 On 07/28/2012 02:35 AM, Bruce Evans wrote:
 > On Sat, 28 Jul 2012, Stephen Montgomery-Smith wrote:
 >
 >> On 07/28/2012 12:25 AM, Bruce Evans wrote:
 >>>
 >>> #define    DE    DBL_EPSILON        // for clarity
 >>>
 >>> (1)   1 + DE/2        = 1         (half way case rounded down to even)
 >>> (2)   1 + DE/2 + DE/2 = 1         (double rounding)
 >>> (3)   DE/2 + DE/2 + 1 = 1 + DE    (null rounding)
 >>>
 >>> We want to add -1 to a value near 1 like the above.  Now a leading 1
 >>> in the above will cancel with the -1, and the the order in (3) becomes
 >>> the inaccurate one.
 >>
 >> Yes, but in my situation, I am rather sure that when I am adding
 >> highest to lowest that this won't occur.  I am starting with -1, then
 >> adding something close to 1, then adding lots of smaller terms.  And I
 >> find it very plausible that the kind of situation you describe won't
 >> happen. x0*x0 is close to 1. x0*x1 is at most sqrt(DE) times smaller.
 >> And so on.  So I think the kind of situation you describe should never
 >> happen.
 >
 > Ahem.  FP^2 space is not nearly as large as the metaverse (only 2^256
 > cases even for sparc64), but it is large enough so that almost
 > everything that can happen in it does happen in it.  You are right
 > that problems are far away with the x* terms (x0*x0 had better not be
 > very close to 1 unless it is exactly 1, since it it is too close then
 > it will no longer be many times larger than x0*x1 after subtracting 1
 > from it; the other cases for x* are simpler).  The problem is with the
 > additional y* terms.  x and y are independent, so for many or most x,
 > there are many y's with bits that cause half-way cases when combined
 > with x.
 > After splitting and squaring, the bits move around, so it hard to generate
 > or control the offending y's.
 >
 >> As I said, I don't have a mathematical proof that the kind of thing
 >> you describe can NEVER happen.  I just have never observed it happen.
 >
 > There might be a measly 2^128 bad cases out of 2^256.  Then no one would
 > even observe them by chance :-).  But half-way cases are fairly common.
 
 I agree.  That is why I am sad that I don't have a mathematical proof. 
 And the probability of picking out the bad example by chance is 
 something like the chances of being hit by a large asteroid, so we will 
 never see it happen in a random experiment.
 

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 10:46:30 -0500

 This is a multi-part message in MIME format.
 --------------060304040300070501030603
 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
 Content-Transfer-Encoding: 7bit
 
 OK.  This clog really seems to work.
 
 x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the 
 errors seem to be due to the implementation of log1p.  The ULP of the 
 final answer seems to be never bigger than a little over 2.
 
 
 
 --------------060304040300070501030603
 Content-Type: text/x-csrc;
  name="cplex.c"
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment;
  filename="cplex.c"
 
 #include <stdio.h>
 #include <string.h>
 #include <complex.h>
 #include <float.h>
 #include <math.h>
 
 #include "math_private.h"
 
 /* Get binary digits -d through -d-16.  Assume x > 0 */
 uint32_t get_bits(double x, int d) {
 	uint32_t mant_hi, mant_lo;
 	int shift;
 
 	if (x == 0)
 		return 0;
 	shift = d + ilogb(x) - 4;
 	EXTRACT_WORDS(mant_hi, mant_lo, x);
 	/* Keep only the mantissa and restore the implicit leading 1 bit. */
 	mant_hi = (mant_hi & 0x000fffff) | 0x00100000;
 
 	/* The requested window lies entirely outside the 53-bit mantissa. */
 	if (shift <= -32 || shift > 63)
 		return 0;
 	if (shift <= 0)
 		return (mant_hi >> -shift) & 0xffff;
 	if (shift < 32)
 		return ((mant_hi << shift) | (mant_lo >> (32 - shift))) & 0xffff;
 	if (shift == 32)
 		return mant_lo & 0xffff;
 	return (mant_lo << (shift - 32)) & 0xffff;
 }
 
 #define NR_BLOCKS 10
 
 /*
  * clog(z) = log(hypot(x, y)) + I*atan2(y, x).  The real part is
  * computed with extra care when hypot(x, y) is close to 1, where
  * naive evaluation of log(x*x + y*y)/2 loses precision.
  */
 double complex
 clog(double complex z)
 {
 	double x, y;
 	double ax, ay, t, hm1;
 	uint64_t xx[NR_BLOCKS+1], yy[NR_BLOCKS+1];
 	uint64_t zz[NR_BLOCKS+1];
 	uint64_t carry;
 	int sign;
 	int i, j;
 
 	x = creal(z);
 	y = cimag(z);
 
 	/* Handle NaNs using the general formula to mix them right. */
 	if (x != x || y != y)
 		return (cpack(log(hypot(x, y)), atan2(y, x)));
 
 	/* Order the magnitudes so that ax >= ay. */
 	ax = fabs(x);
 	ay = fabs(y);
 	if (ax < ay) {
 		t = ax;
 		ax = ay;
 		ay = t;
 	}
 
 	/*
 	 * To avoid unnecessary overflow, if x or y are very large, divide x
 	 * and y by M_E, and then add 1 to the logarithm.  This depends on
 	 * M_E being larger than sqrt(2).
 	 * There is a potential loss of accuracy caused by dividing by M_E,
 	 * but this case should happen extremely rarely.
 	 */
 	if (ay > 5e307)
 		return (cpack(log(hypot(x / M_E, y / M_E)) + 1, atan2(y, x)));
 
 	/*
 	 * When ax == 1 the real part reduces to log1p(ay*ay)/2, which is
 	 * ay*ay/2 to full precision for tiny ay.
 	 */
 	if (ax == 1) {
 		if (ay < 1e-150)
 			return (cpack((ay * 0.5) * ay, atan2(y, x)));
 		return (cpack(log1p(ay * ay) * 0.5, atan2(y, x)));
 	}
 
 	/*
 	 * Because atan2 and hypot conform to C99, this also covers all the
 	 * edge cases when x or y are 0 or infinite.
 	 */
 	if (ax < 1e-50 || ay < 1e-50 || ax > 1e50 || ay > 1e50)
 		return (cpack(log(hypot(x, y)), atan2(y, x)));
 
 	/* 
 	 * From this point on, we don't need to worry about underflow or
 	 * overflow in calculating ax*ax or ay*ay.
 	 */
 
 	/* Some easy cases: hypot(x,y) is far enough from 1. */
 
 	if (ax*ax + ay*ay <= 0.1 || ax*ax + ay*ay >= 10)
 		return (cpack(log(ax*ax + ay*ay) * 0.5, atan2(y, x)));
 
 	/*
 	 * Take extra care so that ULP of real part is small if hypot(x,y) is
 	 * moderately close to 1.  We compute ax*ax + ay*ay - 1 using
 	 * long multiplication in base 2^16.
 	 */
 
 	/*
 	 * Split ax and ay into fixed point numbers with 160 bits after the
 	 * "decimal" point.
 	 */
 	for (i=-1; i<NR_BLOCKS; i++) {
 		xx[i+1] = get_bits(ax,16*i);
 		yy[i+1] = get_bits(ay,16*i);
 	}
 
 	/*
 	 * Long multiplication.  We use 64 bit integers instead of 32 bit
 	 * because we might get slightly bigger than 32 bit numbers due to
 	 * the additions.  (But probably 36 bit integers would be more than
 	 * enough.)
 	 */
 	memset(zz,0,sizeof(zz));
 	for (i=-1; i<NR_BLOCKS; i++)
 		for (j=-1; j<NR_BLOCKS && i+j+1 < NR_BLOCKS; j++) {
 			zz[i+j+2] += xx[i+1]*xx[j+1];
 			zz[i+j+2] += yy[i+1]*yy[j+1];
 		}
 	/* Subtract 1. */
 	zz[0]--;
 
 	/* Handle the carries, leaving 16 bits per block. */
 	carry = 0;
 	for (i=NR_BLOCKS-1; i>=-1; i--) {
 		zz[i+1] += carry;
 		carry = zz[i+1] >> 16;
 		zz[i+1] &= 0xffff;
 	}
 
 	/*
 	 * If the number is negative, compute the 1's complement.  (We
 	 * should compute the 2's complement, but the error will be
 	 * negligible.)
 	 */
 	if ((zz[0] & 0x8000) != 0) {
 		sign = 1;
 		for (i=-1; i<NR_BLOCKS; i++)
 			zz[i+1] = 0xffff & (~zz[i+1]);
 	} else
 		sign = 0;
 
 	/*
 	 * Convert fixed point into floating point, accumulating from the
 	 * least significant block upward to minimize rounding error.
 	 */
 	hm1 = 0;
 	for (i=NR_BLOCKS-1; i>=-1; i--)
 		hm1 += zz[i+1] * exp2(16*(-1-i));
 
 	if (sign == 1) hm1 = -hm1;
 
 	return (cpack(0.5 * log1p(hm1), atan2(y, x)));
 }
 
 /*
  * Float version of clog(): delegate to the double-precision routine;
  * the result is converted to float complex on return.
  */
 float complex
 clogf(float complex z)
 {
 	return ((float complex)clog(z));
 }
 
 
 --------------060304040300070501030603--

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 11:15:20 -0500

 On 07/28/2012 10:46 AM, Stephen Montgomery-Smith wrote:
 > OK.  This clog really seems to work.
 >
 > x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the
 > errors seem to be due to the implementation of log1p.  The ULP of the
 > final answer seems to be never bigger than a little over 2.
 >
 >
 
 
 Also, I don't think the problem is due to the implementation of log1p. 
 If you do an error analysis of log(1+x) where x is about exp(-1)-1, and 
 x is correct to within 0.8 ULP, I suspect that about 2.5 ULP is the best 
 you can do for the final answer:
 
 relative_error(log(1+x)) = fabs(1/((1+x) log(1+x))) * relative_error(x)
                    = 1.58 * relative_error(x)
 
 Given that log1p has itself a ULP of about 1, and relative error in x is 
 0.8, and considering x=exp(-1)-1, this gives a ULP at around 1.58*0.8+1 
 = 2.3.  And that is what I observed.
 
 (Here "=" means approximately equal to.)

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 16:46:39 -0500

 This is a multi-part message in MIME format.
 --------------070300040504040200030709
 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
 Content-Transfer-Encoding: 7bit
 
 Here are some diffs to catrig.c so that it completely passes Peter 
 Jeremy's program www.rulingia.com/~peter/ctest.c.  That is, it seems to 
 get all the infs and nans correct.
 
 
 
 --------------070300040504040200030709
 Content-Type: text/x-diff;
  name="ca.diff"
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment;
  filename="ca.diff"
 
 --- catrig.c-old1	2012-07-28 15:00:46.000000000 -0500
 +++ catrig.c	2012-07-28 16:39:24.000000000 -0500
 @@ -89,7 +89,7 @@
  	S = hypot(x,y-1);
  	A = 0.5*(R + S);
  
 -	if (A < 10) {
 +	if (A < 10 && isfinite(A)) {
  		fp = f(x,1+y,&fpuf);
  		fm = f(x,1-y,&fmuf);
  		if (fpuf == 1 && fmuf == 1) {
 @@ -108,9 +108,23 @@
  		} else {
  			*rx = log1p(fp + fm + sqrt((fp+fm)*(A+1)));
  		}
 -	} else
 +	} else if (isinf(y))
 +		*rx = y;
 +	else
  		*rx = log(A + sqrt(A*A-1));
  
 +	if (!isfinite(y)) {
 +		*B_good = 0;
 +		if (isfinite(x)) *A2my2 = 0;
 +		else if (isnan(x)) *A2my2 = x;
 +		else *A2my2 = y;
 +		return;
 +	} else if (isnan(x) && y == 0) {
 +		*B_good = 0;
 +		*A2my2 = 1;
 +		return;
 +	}
 +
  	*B = y/A; /* = 0.5*(R - S) */
  	*B_good = 1;
  
 @@ -147,7 +161,7 @@
  	x = fabs(x);
  	y = fabs(y);
  
 -	if (cabs(z) > 1e20) {
 +	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
  		if (huge+x>one) { /* set inexact flag. */
  			if (sx == 0) return clog(2*z);
  			if (sx == 1) return -clog(-2*z);
 @@ -206,7 +220,7 @@
  	x = fabs(x);
  	y = fabs(y);
  
 -	if (cabs(z) > 1e20) {
 +	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
  		if (huge+x>one) { /* set inexact flag. */
  			w = clog(2*z);
  			if (signbit(cimag(w)) == 0)
 @@ -271,6 +285,36 @@
  		if (huge+x>one) /* set inexact flag. */
  			return z;
  
 +	if (isinf(x) && isfinite(y)) {
 +		if (!((signbit(x) == 0) ^ (signbit(y) == 0)))
 +			return cpack(0,M_PI_2);
 +		return cpack(0,-M_PI_2);
 +	}
 +
 +	if (isinf(y)) {
 +		rx = copysign(0,x);
 +		if (signbit(y) == 0)
 +			return cpack(rx,M_PI_2);
 +		return cpack(rx,-M_PI_2);
 +	}
 +
 +	if (isinf(x) && isnan(y)) {
 +		rx = copysign(0, x);
 +		return cpack(rx, y);
 +	}
 +
 +	if (x == 0 && isnan(y))
 +		return cpack(x, y);
 +
 +	if (isinf(y)) {
 +		if (signbit(y) == 0)
 +			return cpack(0,M_PI_2);
 +		return cpack(0,-M_PI_2);
 +	}
 +
 +	if (isnan(x) || isnan(y))
 +		return clog(z);
 +
  	if (cabs(z) > 1e20)
  		if (huge+x>one) { /* set inexact flag. */
  			if (signbit(x) == 0)
 @@ -290,13 +334,17 @@
  
  	if (hp < 0.5 || hm < 0.5)
  		rx = 0.25*(log(hp/hm));
 +	else if (x == 0)
 +		rx = x;
  	else if (x > 0)
  		rx = 0.25*log1p(4*x/hm);
  	else
  		rx = -0.25*log1p(-4*x/hp);
  
  	if (x==1 || x==-1) {
 -		if (signbit(y) == 0)
 +		if (y==0)
 +			ry = y;
 +		else if (signbit(y) == 0)
  			ry = atan2(2, -y)/2;
  		else
  			ry = atan2(-2, y)/2;
 
 --------------070300040504040200030709--

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 18:12:07 -0500

 On 07/28/2012 11:15 AM, Stephen Montgomery-Smith wrote:
 > On 07/28/2012 10:46 AM, Stephen Montgomery-Smith wrote:
 >> OK.  This clog really seems to work.
 >>
 >> x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the
 >> errors seem to be due to the implementation of log1p.  The ULP of the
 >> final answer seems to be never bigger than a little over 2.
 >>
 >>
 >
 >
 > Also, I don't think the problem is due to the implementation of log1p.
 > If you do an error analysis of log(1+x) where x is about exp(-1)-1, and
 > x is correct to within 0.8 ULP, I suspect that about 2.5 ULP is the best
 > you can do for the final answer:
 >
 > relative_error(log(1+x)) = fabs(1/((1+x) log(1+x))) * relative_error(x)
 >                    = 1.58 * relative_error(x)
 >
 > Given that log1p has itself a ULP of about 1, and relative error in x is
 > 0.8, and considering x=exp(-1)-1, this gives a ULP at around 1.58*0.8+1
 > = 2.3.  And that is what I observed.
 >
 > (Here "=" means approximately equal to.)
 
 And I should add that I just realized that ULP isn't quite the same as 
 relative error, so an extra factor of up to 2 could make its way into 
 the calculations.
 

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@missouri.edu>
Cc: Bruce Evans <brde@optusnet.com.au>, freebsd-bugs@FreeBSD.org,
        FreeBSD-gnats-submit@FreeBSD.org,
        Stephen Montgomery-Smith <stephen@FreeBSD.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sun, 29 Jul 2012 12:14:18 +1000 (EST)

 On Sat, 28 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > On 07/28/2012 10:46 AM, Stephen Montgomery-Smith wrote:
 >> OK.  This clog really seems to work.
 >> 
 >> x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the
 >> errors seem to be due to the implementation of log1p.  The ULP of the
 >> final answer seems to be never bigger than a little over 2.
 >
 > Also, I don't think the problem is due to the implementation of log1p. If you 
 > do an error analysis of log(1+x) where x is about exp(-1)-1, and x is correct 
 > to within 0.8 ULP, I suspect that about 2.5 ULP is the best you can do for 
 > the final answer:
 >
 > relative_error(log(1+x)) = fabs(1/((1+x) log(1+x))) * relative_error(x)
 >                  = 1.58 * relative_error(x)
 >
 > Given that log1p has itself a ULP of about 1, and relative error in x is 0.8, 
 > and considering x=exp(-1)-1, this gives a ULP at around 1.58*0.8+1 = 2.3. 
 > And that is what I observed.
 
 Not given:
 - the error in log1p is close to 0.5 ulps in my version
 - when the extra internal precision of my log1p is brought out, the error
    will be a maximum of about 1/2**7 ulps before rounding.
 This will only save about half an ulp after rounding.  There is accumulative
 error of about half an ulp from the other 2+ steps.  Extra precision for
 these is further off.
 
 I made progress towards completing exhaustive testing of this point
 for all cases near 1 for clogf.  All cases tested (at least 99% of
 all cases possible) were perfectly rounded.  There were an amazingly
 (before doing the analysis below) large number of half-way cases
 near (1 + ~1e-10*I).  This is an easy special case for clogf() -- x
 is precisely 1 so it reduces immediately to log1pf(1+y*y)/2 -- but
 when x is precisely 1 there are many half-way cases.  Due to
 numerical accidents, it turns out that many cases are correctly
 rounded in float precision but not in double precision.  My testing
 is not quite complete because it doesn't verify that the accidents
 aren't mostly due to my test methodology.  Part of the methodology
 is:
 
 - start with float args.  These have only 24 mantissa bits, so after
    promotion to double and long double precision they have many lower bits
    0.  This is what gives they many half-way cases that are unresolvable
    in long double precision after y is squared.  The magic arg values
    (y near 1e-10) are what is needed for squaring to push the lowest 1
    bit into an unfortunate place.
 - in both clogf() and clog(), return log1pl((long double)y*y)/2 (rounded
    to float precision), so that we see the inherent inaccuracy of clogf()
    and not external inaccuracy from log1pf().  This is uninteresting
    for x = 1 since there are no internal extra-precision calculations
    to verify for that case, but it is interesting for the general case
    to verify that the doubled float precision algorithm is exact.  I
    don't want the general case hidden by errors for this case, so I
    evaluate y*y in long double precision so as to minimise rounding
    errors in it, although the normal inaccurate y*y is part of clogf()
    (we could use doubled precision for it, but we don't because we know
    that all the extra precision would be lost when it is passed to
    log1pf().  Except when y*y is near underflow, we are more careful).
 - it turns out that the extra precision of log1pl() is enough for
    perfect rounding in float precision but not for perfect rounding
    in double precision.  This is no accident.  log1pl() gives only
    11 extra bits for double precision, but 40 extra for float
    precision.  A simple probabilistic argument shows that 40 is always
    enough unless we are unlucky.  The chance of a half-way case that
    is not resolvable in long double precision is about 2**-11 and
    2**-40 in the 2 cases, respectively.  There are less than 2**-24
    args of interest (all x between sqrt(2)-epsilon and 1, corresponding
    y near sqrt(1-x*x)).  We would have to be much more unlucky to hit
    a half-way case with so many fewer cases in float precision.
    Assuming that the half-way cases are uniformly distributed, which
    they aren't, then the probabities for _not_ hitting a half-way case
    would be related to:
      (1-2**-40)**(2**24) = 0.999985  for float precision
      (1-2**-11)**(2**53) = <too small for pari>, but much smaller than
      (1-2**-11)**(2**39) = 3.315e-116608509  for double precision
    This calculation is certainly off by many powers of 2 even in float
    precision.  It's a bit surprising that the probability is so high
    for 2**24 cases (no birthday paradox with uniform distribution).
 
 Bruce

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 21:27:20 -0500

 On 07/28/2012 06:12 PM, Stephen Montgomery-Smith wrote:
 > On 07/28/2012 11:15 AM, Stephen Montgomery-Smith wrote:
 >> On 07/28/2012 10:46 AM, Stephen Montgomery-Smith wrote:
 >>> OK.  This clog really seems to work.
 >>>
 >>> x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the
 >>> errors seem to be due to the implementation of log1p.  The ULP of the
 >>> final answer seems to be never bigger than a little over 2.
 >>>
 >>>
 >>
 >>
 >> Also, I don't think the problem is due to the implementation of log1p.
 >> If you do an error analysis of log(1+x) where x is about exp(-1)-1, and
 >> x is correct to within 0.8 ULP, I suspect that about 2.5 ULP is the best
 >> you can do for the final answer:
 >>
 >> relative_error(log(1+x)) = fabs(1/((1+x) log(1+x))) * relative_error(x)
 >>                    = 1.58 * relative_error(x)
 >>
 >> Given that log1p has itself a ULP of about 1, and relative error in x is
 >> 0.8, and considering x=exp(-1)-1, this gives a ULP at around 1.58*0.8+1
 >> = 2.3.  And that is what I observed.
 >>
 >> (Here "=" means approximately equal to.)
 >
 > And I should add that I just realized that ULP isn't quite the same as
 > relative error, so an extra factor of up to 2 could make its way into
 > the calculations.
 
 In fact, I think I messed it up a bunch.
 
 So let D(f(x)) denote the absolute error in f(x).
 
 D(f(x)) = f'(x) Dx.
 
 So
 
 D(log(1+x)) = Dx/(1+x).
 
 If x is a bit bigger than exp(-1)-1 = -0.63, which has ilogb = -1.  If 
 ULP in calculating x is around 0.8, then
 Dx = 0.8 * 2^(-d-1).
 where d is the number of bits in the mantissa,
 
 So D(log(1+x)) = Dx/0.37.
 Since log(1+x) is a little bit bigger than -1, and so ilogb(log(1+x)) = -1.
 
 ULP(log(1+x)) = Dx/0.37 * 2^{d+1} = 0.8/0.37 = 2.2
 
 Now add 1 for ULP in calculating log1p, and this only gives a ULP of 
 3.2.  So the observed bound is actually better than expected.  If one 
 could get the ULP of log1p to be as good as possible (0.5), the best ULP 
 one could get is 2.7.  We still do a bit better than that.
 

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: Bruce Evans <brde@optusnet.com.au>
Cc: freebsd-bugs@freebsd.org, FreeBSD-gnats-submit@freebsd.org,
        Stephen Montgomery-Smith <stephen@freebsd.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sat, 28 Jul 2012 21:29:42 -0500

 This is a multi-part message in MIME format.
 --------------050105020207030702040007
 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
 Content-Transfer-Encoding: 7bit
 
 On 07/28/2012 04:46 PM, Stephen Montgomery-Smith wrote:
 > Here are some diffs to catrig.c so that it completely passes Peter
 > Jeremy's program www.rulingia.com/~peter/ctest.c.  That is, it seems to
 > get all the infs and nans correct.
 
 And I think I messed up these diffs as well.  Can we try this instead?
 
 
 
 
 --------------050105020207030702040007
 Content-Type: text/x-diff;
  name="ca.diff"
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment;
  filename="ca.diff"
 
 --- catrig.c-old1	2012-07-28 15:00:46.000000000 -0500
 +++ catrig.c	2012-07-28 21:10:20.000000000 -0500
 @@ -89,7 +89,7 @@
  	S = hypot(x,y-1);
  	A = 0.5*(R + S);
  
 -	if (A < 10) {
 +	if (A < 10 && isfinite(A)) {
  		fp = f(x,1+y,&fpuf);
  		fm = f(x,1-y,&fmuf);
  		if (fpuf == 1 && fmuf == 1) {
 @@ -108,9 +108,24 @@
  		} else {
  			*rx = log1p(fp + fm + sqrt((fp+fm)*(A+1)));
  		}
 -	} else
 +	} else if (isinf(y))
 +		*rx = y;
 +	else
  		*rx = log(A + sqrt(A*A-1));
  
 +	if (!isfinite(y)) {
 +		*B_good = 0;
 +		if (isfinite(x)) *A2my2 = 0;
 +		else if (isnan(x)) *A2my2 = x;
 +		else *A2my2 = y;
 +		return;
 +	}
 +	if (isnan(x) && y == 0) {
 +		*B_good = 0;
 +		*A2my2 = 1;
 +		return;
 +	}
 +
  	*B = y/A; /* = 0.5*(R - S) */
  	*B_good = 1;
  
 @@ -147,7 +162,7 @@
  	x = fabs(x);
  	y = fabs(y);
  
 -	if (cabs(z) > 1e20) {
 +	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
  		if (huge+x>one) { /* set inexact flag. */
  			if (sx == 0) return clog(2*z);
  			if (sx == 1) return -clog(-2*z);
 @@ -164,8 +179,8 @@
  	else
  		ry = atan2(y,A2my2);
  
 -	if (sx == 1) rx = -rx;
 -	if (sy == 1) ry = -ry;
 +	if (sx == 1) rx = copysign(rx, -1);
 +	if (sy == 1) ry = copysign(ry, -1);
  
  	return cpack(rx,ry);
  }
 @@ -206,7 +221,7 @@
  	x = fabs(x);
  	y = fabs(y);
  
 -	if (cabs(z) > 1e20) {
 +	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
  		if (huge+x>one) { /* set inexact flag. */
  			w = clog(2*z);
  			if (signbit(cimag(w)) == 0)
 @@ -232,7 +247,7 @@
  			rx = atan2(A2my2,-x);
  	}
  
 -	if (sy==0) ry = -ry;
 +	if (sy==0) ry = copysign(ry, -1);
  
  	return cpack(rx,ry);
  }
 @@ -271,11 +286,23 @@
  		if (huge+x>one) /* set inexact flag. */
  			return z;
  
 -	if (cabs(z) > 1e20)
 -		if (huge+x>one) { /* set inexact flag. */
 -			if (signbit(x) == 0)
 -				return cpack(0,M_PI_2);
 -			return cpack(0,-M_PI_2);
 +	if (isinf(x) && isnan(y))
 +		return cpack(copysign(0, x), y);
 +
 +	if (isnan(x) && isinf(y))
 +		return cpack(copysign(0, x), copysign(M_PI_2,y));
 +
 +	if (x == 0 && isnan(y))
 +		return cpack(x, y);
 +
 +	if (isnan(x) || isnan(y))
 +		return clog(z);
 +
 +	if (cabs(z) > 1e20) {
 +		if (isinf(x) || isinf(y))
 +			return cpack(copysign(0,x),copysign(M_PI_2,y));
 +		if (huge+x>one) /* set inexact flag. */
 +			return cpack(copysign(0,x),copysign(M_PI_2,y));
  	}
  
  	if (fabs(y) < 1e-100) {
 @@ -290,13 +317,17 @@
  
  	if (hp < 0.5 || hm < 0.5)
  		rx = 0.25*(log(hp/hm));
 +	else if (x == 0)
 +		rx = x;
  	else if (x > 0)
  		rx = 0.25*log1p(4*x/hm);
  	else
  		rx = -0.25*log1p(-4*x/hp);
  
  	if (x==1 || x==-1) {
 -		if (signbit(y) == 0)
 +		if (y==0)
 +			ry = y;
 +		else if (signbit(y) == 0)
  			ry = atan2(2, -y)/2;
  		else
  			ry = atan2(-2, y)/2;
 
 --------------050105020207030702040007--

From: Bruce Evans <brde@optusnet.com.au>
To: Stephen Montgomery-Smith <stephen@missouri.edu>
Cc: Bruce Evans <brde@optusnet.com.au>, freebsd-bugs@FreeBSD.org,
        FreeBSD-gnats-submit@FreeBSD.org,
        Stephen Montgomery-Smith <stephen@FreeBSD.org>
Subject: Re: bin/170206: complex arcsinh, log, etc.
Date: Sun, 29 Jul 2012 15:59:49 +1000 (EST)

 On Sat, 28 Jul 2012, Stephen Montgomery-Smith wrote:
 
 > On 07/28/2012 11:15 AM, Stephen Montgomery-Smith wrote:
 >> On 07/28/2012 10:46 AM, Stephen Montgomery-Smith wrote:
 >>> OK.  This clog really seems to work.
 >>> 
 >>> x*x + y*y - 1 is computed with a ULP less than 0.8.  The rest of the
 >>> errors seem to be due to the implementation of log1p.  The ULP of the
 >>> final answer seems to be never bigger than a little over 2.
 > ...
 > And I should add that I just realized that ULP isn't quite the same as 
 > relative error, so an extra factor of up to 2 could make its way into the 
 > calculations.
 
 Yes, this is tricky.  For denormals, it is easy to be off by a factor of
 2**MANT_DIG or infinity instead of only 2.
 
 For normals, the most interesting cases are near powers of 2 (say 1).
 One ulp is twice as large for values in [1, 2) as it is for values
 in [0.5, 1).  Even to determine which one to use, you need to know
 if the infinitely precise result is >= 1 or < 1, else you may be
 off by a factor of 2 in the error checking.  If the factor is 1/2,
 then it hides errors, and if it is 2 then it gives unexpected errors.
 
 For denormals, the easiest case to understand is when the correctly
 rounded case is the smallest strictly positive denormal.  Then the
 size of an ulp is the same as the value of this denormal.  A rounding
 error of < 1 ulp (but > 0.5 ulps) may give a result of 0 ulps or 2 ulps.
 Such errors are to be expected.  But relatively, they are infinity and
 2, respectively.  Normally you expected rounding errors of near
 2**-MANT_DIG, and infinity and 2 are much larger.  The relative error
 should be scaled (like the size of an ulp should be) so that you don't
 see such large errors.
 
 You might not care about denormals, but you should check a few of them
 and then you don't want errors in the checking software for them hiding
 errors for normals.  Without denormals, there would be no gradual
 underflow, and underflow from the smallest normal to 0 really would be
 essentially infinity (it would be about 2**MANT_DIG in ulps).
 
 My checking software scales for denormals, but I think I got it wrong
 and am off by a factor of 2.  For normals near a power of 2, it's just
 impossible to determine the right scale without a reference function
 with enough extra precision to determine which side it is on.  Even
 the correct definition of an ulp is unclear near powers of 2 (perhaps
 other cases).  I want my checking program to match my definition, which
 is currently just what the checking program does :-) -- don't worry
 about the reference function not be precise enough.  There is the
 same uncertainty about the size of an ulp for a result of 0 (and
 more -- maybe the result should be -0 :-).
 
 Recently I started checking extra internal precision for some functions.
 This gives relative errors of < 2**(-MANT_DIG+N), where N is the extra
 precision.  When the relative error is near 2**-MANT_DIG, it is hard
 to tell if it is smaller than 1 ulp, since the ulp scale may be hard
 to determine and the relative error doesn't map simply to ulps.  When
 N >= 2, there is a factor of 2 to spare and final errors (after
 rounding) that are certain to be < 1 ulp are easy to ignore.  Even
 functions that don't have much extra internal precision but which
 meet my design goal of a final error < 1 ulp should have N at
 least 1, so their non-errors should be easy to not see too, but I have
 used this mainly for functions with N about 7.  The internal errors
 for the recently committed logl() are < 2**-120 relative on sparc64,
 except for denormal results.  This is obviously enough for 113-bit
 precision to 0.5+epsilon ulps.  But rounding would cluster the final
 errors near 2**-113 or 2**-114 and it wouldn't be obvious how this
 maps to ulps.
 
 Bruce

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: bug-followup@FreeBSD.org, stephen@FreeBSD.org
Cc:  
Subject: Re: bin/170206: [msun] [patch] complex arcsinh, log, etc.
Date: Sun, 29 Jul 2012 16:07:17 -0500

 This is a multi-part message in MIME format.
 --------------070709080600030802090207
 Content-Type: text/plain; charset=ISO-8859-1; format=flowed
 Content-Transfer-Encoding: 7bit
 
 I found a bug in catanh when |z| is very large.  I believe I have 
 corrected the bug in catrig.c.  I made some other small changes also, 
 mostly style and comments.
 
 
 
 --------------070709080600030802090207
 Content-Type: text/x-csrc;
  name="catrig.c"
 Content-Transfer-Encoding: 7bit
 Content-Disposition: attachment;
  filename="catrig.c"
 
 #include <complex.h>
 #include <float.h>
 #include <math.h>
 
 #include "math_private.h"
 
 /*
  * gcc doesn't implement complex multiplication or division correctly,
  * so we need to handle infinities specially. We turn on this pragma to
  * notify conforming c99 compilers that the fast-but-incorrect code that
  * gcc generates is acceptable, since the special cases have already been
  * handled.
  */
 #pragma	STDC CX_LIMITED_RANGE	ON
 
 double complex clog(double complex z);
 
 static const double
 one =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
 huge=  1.00000000000000000000e+300; /* "huge+x > one" is used below to raise the inexact flag */
 
 /*
  * Testing indicates that all these functions are accurate up to 4 ULP.
  */
 
 /*
  * The algorithm is very close to that in "Implementing the complex arcsine
  * and arccosine functions using exception handling" by T. E. Hull,
  * Thomas F. Fairgrieve, and Ping Tak Peter Tang, published in ACM
  * Transactions on Mathematical Software, Volume 23 Issue 3, 1997, Pages
  * 299-335, http://dl.acm.org/citation.cfm?id=275324
  *
  * casinh(x+I*y) = sign(x)*log(A+sqrt(A*A-1)) + sign(y)*I*asin(B)
  * where
  * A = 0.5(|z+I| + |z-I|) = f(x,1+y) + f(x,1-y) + 1
  * B = 0.5(|z+I| - |z-I|)
  * z = x+I*y
  * f(x,y) = 0.5*(hypot(x,y)-y)
  *
  * We also use
  * asin(B) = atan2(y,sqrt(A*A-y*y))
  * A-y = f(x,y+1) + f(x,y-1).
  *
  * Much of the difficulty comes because computing f(x,y) may produce
  * underflows.
  */
 
 /*
  * Returns 0.5*(hypot(x,y)-y).  It assumes x is positive, and that y does
  * not satisfy the inequalities 0 < fabs(y) < 1e-20.
  * If reporting the answer risks an underflow, the underflow flag is set,
  * and it returns 0.5*(hypot(x,y)-y)/x/x.
  */
 inline static double
 f(double x, double y, int *underflow)
 {
 	if (x==0) {
 		*underflow = 0;
 		if (y > 0)
 			return 0;
 		return -y;
 	}
 	if (y==0) {
 		*underflow = 0;
 		return 0.5*x;
 	}
 	if (x < 1e-100 && x < y) {
 		*underflow = 1;
 		return 0.5/(hypot(x,y)+y);
 	}
 	if (x < y) {
 		*underflow = 0;
 		return 0.5*x*x/(hypot(x,y)+y);
 	}
 	*underflow = 0;
 	return 0.5*(hypot(x,y)-y);
 }
 
 /*
  * All the hard work is contained in this function.
  * Upon return:
  * rx = Re(casinh(x+I*y))
  * B_is_usable is set to 1 if the value of B is usable.
  * If B_is_usable is set to 0, A2my2 = sqrt(A*A-y*y).
  */
 inline static void
 do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B, double *A2my2)
 {
 	double R, S, A, fp, fm;
 	int fpuf, fmuf;	/* underflow flags returned by f() for fp and fm */
 
 	R = hypot(x,y+1);	/* = |z+I| */
 	S = hypot(x,y-1);	/* = |z-I| */
 	A = 0.5*(R + S);	/* A = 0.5*(|z+I| + |z-I|) */
 
 	/*
 	 * The paper by Hull et al suggests using A < 1.5.  Experiment
 	 * suggested A < 10 worked better.
 	 */
 	if (A < 10 && isfinite(A)) {
 		/*
 		 * rx = log1p((A-1) + sqrt((A-1)*(A+1))), with
 		 * A-1 = fp + fm (see the comment above f()).  When an
 		 * underflow flag is set, f() returned its value scaled
 		 * by 1/x/x, and the factors of x are restored inside
 		 * the sqrt below.
 		 */
 		fp = f(x,1+y,&fpuf);
 		fm = f(x,1-y,&fmuf);
 		if (fpuf == 1 && fmuf == 1) {
 			/* Both terms scaled by 1/x/x: multiply x back in. */
 			if (huge+x>one) /* set inexact flag. */
 				*rx = log1p(x*sqrt((fp+fm)*(A+1)));
 		} else if (fmuf == 1) {
 			/*
 			 * Overflow not possible because fp < 1e50 and
 			 * x > 1e-100.
 			 * Underflow not possible because either fm=0 or fm
 			 * approximately bigger than 1e-200.
 			 */
 			if (huge+x>one) /* set inexact flag. */
 				*rx = log1p(fp+sqrt(x)*sqrt((fp/x+fm*x)*(A+1)));
 		} else if (fpuf == 1) {
 			/* Similar arguments against over/underflow. */
 			if (huge+x>one) /* set inexact flag. */
 				*rx = log1p(fm+sqrt(x)*sqrt((fm/x+fp*x)*(A+1)));
 		} else {
 			*rx = log1p(fp + fm + sqrt((fp+fm)*(A+1)));
 		}
 	} else if (isinf(y))
 		*rx = y;
 	else
 		/* A >= 10 (or NaN): evaluate log(A+sqrt(A*A-1)) directly. */
 		*rx = log(A + sqrt(A*A-1));
 
 	/* Non-finite y: B is unusable; hand back an atan2 second argument. */
 	if (!isfinite(y)) {
 		*B_is_usable = 0;
 		if (isfinite(x)) *A2my2 = 0;
 		else if (isnan(x)) *A2my2 = x;
 		else *A2my2 = y;
 		return;
 	}
 	if (isnan(x) && y == 0) {
 		*B_is_usable = 0;
 		*A2my2 = 1;
 		return;
 	}
 
 	*B = y/A; /* = 0.5*(R - S) */
 	*B_is_usable = 1;
 
 	/*
 	 * The paper by Hull et al suggests using B > 0.6417.  I just made up
 	 * the number 0.5.  It seems to work.
 	 */
 	if (*B > 0.5) {
 		*B_is_usable = 0;
 		/*
 		 * A2my2 = sqrt(A*A-y*y) = sqrt((A+y)*(A-y)), with
 		 * A-y = fp + fm (same scaling convention as above).
 		 */
 		fp = f(x,y+1,&fpuf);
 		fm = f(x,y-1,&fmuf);
 		if (fpuf == 1 && fmuf == 1)
 			*A2my2 =x*sqrt((A+y)*(fp+fm));
 		else if (fmuf == 1)
 			/*
 			 * Overflow not possible because fp < 1e50 and
 			 * x > 1e-100.
 			 * Underflow not possible because either fm=0 or fm
 			 * approximately bigger than 1e-200.
 			 */
 			*A2my2 = sqrt(x)*sqrt((A+y)*(fp/x+fm*x));
 		else if (fpuf == 1)
 			/* Similar arguments against over/underflow. */
 			*A2my2 = sqrt(x)*sqrt((A+y)*(fm/x+fp*x));
 		else
 			*A2my2 = sqrt((A+y)*(fp+fm));
 	}
 }
 
 double complex
 casinh(double complex z)
 {
 	double x, y, rx, ry, B, A2my2;
 	int sx, sy;
 	int B_is_usable;
 
 	x = creal(z);
 	y = cimag(z);
 	sx = signbit(x);
 	sy = signbit(y);
 	/* Work in the first quadrant; the signs are restored at the end. */
 	x = fabs(x);
 	y = fabs(y);
 
 	/* For large finite |z|, casinh(z) is approximately +-clog(+-2*z). */
 	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
 		if (huge+x>one) { /* set inexact flag. */
 			if (sx == 0) return clog(2*z);
 			if (sx == 1) return -clog(-2*z);
 		}
 	}
 
 	/* For very small |z|, casinh(z) is approximately z itself. */
 	if (cabs(z) < 1e-20)
 		if (huge+x>one) /* set inexact flag. */
 			return z;
 
 	do_hard_work(x, y, &rx, &B_is_usable, &B, &A2my2);
 	if (B_is_usable)
 		ry = asin(B);
 	else
 		/* B > 0.5 or y is special: use atan2(y, sqrt(A*A-y*y)). */
 		ry = atan2(y,A2my2);
 
 	if (sx == 1)
 		rx = copysign(rx, -1); /* casinh is odd. */
 	if (sy == 1)
 		ry = copysign(ry, -1); /* casinh(conj(z)) = conj(casinh(z)). */
 
 	return cpack(rx,ry);
 }
 
 /*
  * casin(z) = reverse(casinh(reverse(z)))
  * where reverse(x+I*y) = y+x*I = I*conj(x+I*y).
  */
 
 double complex
 casin(double complex z)
 {
 	/*
 	 * Was "complex w;": a bare "complex" (_Complex) is not a valid
 	 * C99 type specifier combination, only a gcc extension.
 	 */
 	double complex w;
 
 	/* casin(z) = reverse(casinh(reverse(z))), reverse(x+I*y) = y+I*x. */
 	w = casinh(cpack(cimag(z),creal(z)));
 	return cpack(cimag(w),creal(w));
 }
 
 /*
  * cacos(z) = PI/2 - casin(z)
  * but do the computation carefully so cacos(z) is accurate when z is
  * close to 1.
  */
 
 double complex
 cacos(double complex z)
 {
 	double x, y, rx, ry, B, A2my2;
 	int sx, sy;
 	int B_is_usable;
 	/*
 	 * Was "complex w;": a bare "complex" (_Complex) is not a valid
 	 * C99 type specifier combination, only a gcc extension.
 	 */
 	double complex w;
 
 	x = creal(z);
 	y = cimag(z);
 	sx = signbit(x);
 	sy = signbit(y);
 
 	/* Work with |x|, |y|; the signs are reapplied below. */
 	x = fabs(x);
 	y = fabs(y);
 
 	/* For large finite |z|, cacos(z) is approximately -+I*clog(2*z). */
 	if (cabs(z) > 1e20 && isfinite(x) && isfinite(y)) {
 		if (huge+x>one) { /* set inexact flag. */
 			w = clog(2*z);
 			if (signbit(cimag(w)) == 0)
 				return cpack(cimag(w),-creal(w));
 			return cpack(-cimag(w),creal(w));
 		}
 	}
 
 	/* For small |z|, cacos(z) is approximately M_PI_2 - z. */
 	if (cabs(z) < 1e-10)
 		if (huge+x>one) { /* set inexact flag. */
 			if (signbit(cimag(z)) == 0)
 				return cpack(M_PI_2-creal(z),copysign(cimag(z),-1));
 			return cpack(M_PI_2-creal(z),copysign(cimag(z),1));
 		}
 
 	/* Arguments swapped vs casinh: ry = Re(casinh(y+I*x)) here. */
 	do_hard_work(y, x, &ry, &B_is_usable, &B, &A2my2);
 	if (B_is_usable) {
 		if (sx==0)
 			rx = acos(B);
 		else
 			rx = acos(-B);
 	} else {
 		if (sx==0)
 			rx = atan2(A2my2,x);
 		else
 			rx = atan2(A2my2,-x);
 	}
 
 	if (sy==0)
 		ry = copysign(ry, -1); /* cacos(conj(z)) = conj(cacos(z)). */
 
 	return cpack(rx,ry);
 }
 
 /*
  * cacosh(z) = I*cacos(z) or -I*cacos(z)
  * where the sign is chosen so Re(cacosh(z)) >= 0.
  */
 
 double complex
 cacosh(double complex z)
 {
 	double complex w;
 	double rx, ry;
 
 	w = cacos(z);
 	rx = creal(w);
 	ry = cimag(w);
 	/* Multiply by -I or +I so that the real part comes out >= 0. */
 	if (signbit(ry) == 0)
 		return (cpack(ry, copysign(rx, -1)));
 	return (cpack(-ry, rx));
 }
 
 /* 
  * catanh(z) = 0.5 * log((1+z)/(1-z))
  *           = 0.25 * log(|z+1|^2/|z-1|^2) + 0.5 * I * atan2(2y, (1-x*x-y*y))
  *
  * Note that |z+1|^2/|z-1|^2 = 1 + 4*x/|z-1|^2
  *                           = 1 / ( 1 - 4*x/|z+1|^2 )
  *
  * If |z| is large, then
  * catanh(z) approx = 1/z + sign(y)*I*PI/2
  */
 double complex
 catanh(double complex z)
 {
 	double x, y, rx, ry;
 	double zp1_2, zm1_2; /* |z+1|^2 and |z-1|^2 */
 
 	x = creal(z);
 	y = cimag(z);
 
 	/* For very small |z|, catanh(z) is approximately z itself. */
 	if (cabs(z) < 1e-20)
 		if (huge+x>one) /* set inexact flag. */
 			return z;
 
 	/* Special NaN/Inf combinations handled explicitly. */
 	if (isinf(x) && isnan(y))
 		return cpack(copysign(0, x), y);
 
 	if (isnan(x) && isinf(y))
 		return cpack(copysign(0, x), copysign(M_PI_2,y));
 
 	if (x == 0 && isnan(y))
 		return cpack(x, y);
 
 	/* Remaining NaN cases: let clog() produce the NaN results. */
 	if (isnan(x) || isnan(y))
 		return clog(z);
 
 	if (isinf(x) || isinf(y))
 		return cpack(copysign(0,x),copysign(M_PI_2,y));
 
 	/* For large |z|, use the expansion from the comment above. */
 	if (cabs(z) > 1e20)
 		if (huge+x>one) { /* set inexact flag. */
 			if (x==0)
 				return cpack(copysign(0,x),copysign(M_PI_2,y));
 			else
 				return 1/z + cpack(0,copysign(M_PI_2,y));
 		}
 
 	/*
 	 * For |y| < 1e-100 the y*y term is dropped.
 	 * NOTE(review): when x == +-1 exactly and 0 < |y| < 1e-100, this
 	 * makes zm1_2 (or zp1_2) exactly 0, so rx below becomes +Inf
 	 * rather than the large finite value 0.25*log(4/(y*y)) — TODO
 	 * confirm whether this case is intended to be reachable.
 	 */
 	if (fabs(y) < 1e-100) {
 		if (huge+x>one) { /* set inexact flag. */
 			zp1_2 = (x+1)*(x+1);
 			zm1_2 = (x-1)*(x-1);
 		}
 	} else {
 		zp1_2 = (x+1)*(x+1)+y*y;
 		zm1_2 = (x-1)*(x-1)+y*y;
 	}
 
 	/*
 	 * rx = 0.25*log(|z+1|^2/|z-1|^2), via log1p and the identities in
 	 * the comment above when neither squared distance is small.
 	 */
 	if (zp1_2 < 0.5 || zm1_2 < 0.5)
 		rx = 0.25*(log(zp1_2/zm1_2));
 	else if (x == 0)
 		rx = x;
 	else if (x > 0)
 		rx = 0.25*log1p(4*x/zm1_2);
 	else
 		rx = -0.25*log1p(-4*x/zp1_2);
 
 	/* ry = 0.5*atan2(2y, 1-x*x-y*y), with special care at x = +-1. */
 	if (x==1 || x==-1) {
 		if (y==0)
 			ry = y;
 		else if (signbit(y) == 0)
 			ry = atan2(2, -y)/2;
 		else
 			ry = atan2(-2, y)/2;
 	} else if (fabs(y) < 1e-100) {
 		if (huge+x>one) /* set inexact flag. */
 			ry = atan2(2*y, (1-x)*(1+x))/2;
 	} else
 		ry = atan2(2*y, (1-x)*(1+x)-y*y)/2;
 
 	return cpack(rx,ry);
 }
 
 /*
  * catan(z) = reverse(catanh(reverse(z)))
  * where reverse(x+I*y) = y+x*I = I*conj(x+I*y).
  */
 
 double complex
 catan(double complex z)
 {
 	/*
 	 * Was "complex w;": a bare "complex" (_Complex) is not a valid
 	 * C99 type specifier combination, only a gcc extension.
 	 */
 	double complex w;
 
 	/* catan(z) = reverse(catanh(reverse(z))), reverse(x+I*y) = y+I*x. */
 	w = catanh(cpack(cimag(z),creal(z)));
 	return cpack(cimag(w),creal(w));
 }
 
 --------------070709080600030802090207--

From: Stephen Montgomery-Smith <stephen@missouri.edu>
To: bug-followup@FreeBSD.org, stephen@FreeBSD.org
Cc:  
Subject: Re: bin/170206: [msun] [patch] complex arcsinh, log, etc.
Date: Mon, 30 Jul 2012 17:29:56 -0500

 I'm going to stop posting updates here, and keep the latest version 
 here: http://people.freebsd.org/~stephen/
Responsible-Changed-From-To: freebsd-bugs->freebsd-numerics 
Responsible-Changed-By: peterj 
Responsible-Changed-When: Mon Nov 5 20:01:32 UTC 2012 
Responsible-Changed-Why:  
Redirect to -numerics 

http://www.freebsd.org/cgi/query-pr.cgi?pr=170206 
>Unformatted:
