expFloat32x4 function
SIMD Optimized Exponential function.
Author: Graciliano Monteiro Passos
Implementation
Float32x4 expFloat32x4(Float32x4 entry) {
entry = entry.clamp(_float32x4__87, _float32x4_87);
final x = entry.x;
final y = entry.y;
final z = entry.z;
final w = entry.w;
var xInt = _expXToInt(x);
var yInt = _expXToInt(y);
var zInt = _expXToInt(z);
var wInt = _expXToInt(w);
final xIdx = expIntTableMaxIndex + xInt;
final yIdx = expIntTableMaxIndex + yInt;
final zIdx = expIntTableMaxIndex + zInt;
final wIdx = expIntTableMaxIndex + wInt;
// Lookup exp(floor(x)).
// intPartA will have the upper 22 bits, intPartB will have the lower
// 52 bits.
final intPartA = Float32x4(expIntTableA[xIdx], expIntTableA[yIdx],
expIntTableA[zIdx], expIntTableA[wIdx]);
final intPartB = Float32x4(expIntTableB[xIdx], expIntTableB[yIdx],
expIntTableB[zIdx], expIntTableB[wIdx]);
final entryInt = Float32x4(
xInt.toDouble(), yInt.toDouble(), zInt.toDouble(), wInt.toDouble());
// Get the fractional part of x, find the greatest multiple of 2^-10 less than
// x and look up the exp function of it.
// fracPartA will have the upper 22 bits, fracPartB the lower 52 bits.
final frac = ((entry - entryInt) * _float32x4_1024);
final fracIntX = frac.x.toInt();
final fracIntY = frac.y.toInt();
final fracIntZ = frac.z.toInt();
final fracIntW = frac.w.toInt();
final fracPartA = Float32x4(expFracTableA[fracIntX], expFracTableA[fracIntY],
expFracTableA[fracIntZ], expFracTableA[fracIntW]);
final fracPartB = Float32x4(expFracTableB[fracIntX], expFracTableB[fracIntY],
expFracTableB[fracIntZ], expFracTableB[fracIntW]);
// epsilon is the difference in x from the nearest multiple of 2^-10. It
// has a value in the range 0 <= epsilon < 2^-10.
// Do the subtraction from x as the last step to avoid possible
// loss of precision.
final epsilon = entry -
(entryInt +
Float32x4(
fracIntX.toDouble(),
fracIntY.toDouble(),
fracIntZ.toDouble(),
fracIntW.toDouble(),
) /
_float32x4_1024);
// Compute z = exp(epsilon) - 1.0 via a minimax polynomial. z has
// full double precision (52 bits). Since z < 2^-10, we will have
// 62 bits of precision when combined with the contant 1. This will be
// used in the last addition below to get proper rounding.
// Remez generated polynomial. Converges on the interval [0, 2^-10], error
// is less than 0.5 ULP
var o = _float32x4_0_04168701738764507;
o = o * epsilon + _float32x4_0_1666666505023083;
o = o * epsilon + _float32x4_0_5000000000042687;
o = o * epsilon + _float32x4_1_0;
o = o * epsilon + _float32x4__3_940510424527919E_20;
// Compute (intPartA+intPartB) * (fracPartA+fracPartB) by binomial
// expansion.
// tempA is exact since intPartA and intPartB only have 22 bits each.
// tempB will have 52 bits of precision.
final tempA = intPartA * fracPartA;
final tempB =
intPartA * fracPartB + intPartB * fracPartA + intPartB * fracPartB;
// Compute the result. (1+z)(tempA+tempB). Order of operations is
// important. For accuracy add by increasing size. tempA is exact and
// much larger than the others. If there are extra bits specified from the
// pow() function, use them.
final tempC = tempB + tempA;
final result = tempC * o + tempB + tempA;
return result;
}