FeMul function
FeMul calculates h = f * g Can overlap h with f or g.
Preconditions: |f| bounded by 1.12^26,1.12^25,1.12^26,1.12^25,etc. |g| bounded by 1.12^26,1.12^25,1.12^26,1.12^25,etc.
Postconditions: |h| bounded by 1.12^25,1.12^24,1.12^25,1.12^24,etc.
Notes on implementation strategy:
Using schoolbook multiplication. Karatsuba would save a little in some cost models.
Most multiplications by 2 and 19 are 32-bit precomputations; cheaper than 64-bit postcomputations.
There is one remaining multiplication by 19 in the carry chain; one *19 precomputation can be merged into this, but the resulting data flow is considerably less clean.
There are 12 carries below. 10 of them are 2-way parallelizable and vectorizable. Can get away with 11 carries, but then data flow is much deeper.
With tighter constraints on inputs, can squeeze carries into int32.
Implementation
void FeMul(FieldElement h, f, g) {
var f0 = f[0];
var f1 = f[1];
var f2 = f[2];
var f3 = f[3];
var f4 = f[4];
var f5 = f[5];
var f6 = f[6];
var f7 = f[7];
var f8 = f[8];
var f9 = f[9];
var f1_2 = 2 * f[1];
var f3_2 = 2 * f[3];
var f5_2 = 2 * f[5];
var f7_2 = 2 * f[7];
var f9_2 = 2 * f[9];
var g0 = g[0];
var g1 = g[1];
var g2 = g[2];
var g3 = g[3];
var g4 = g[4];
var g5 = g[5];
var g6 = g[6];
var g7 = g[7];
var g8 = g[8];
var g9 = g[9];
var g1_19 = 19 * g[1]; /* 1.4*2^29 */
var g2_19 = 19 * g[2]; /* 1.4*2^30; still ok */
var g3_19 = 19 * g[3];
var g4_19 = 19 * g[4];
var g5_19 = 19 * g[5];
var g6_19 = 19 * g[6];
var g7_19 = 19 * g[7];
var g8_19 = 19 * g[8];
var g9_19 = 19 * g[9];
var h0 = f0 * g0 +
f1_2 * g9_19 +
f2 * g8_19 +
f3_2 * g7_19 +
f4 * g6_19 +
f5_2 * g5_19 +
f6 * g4_19 +
f7_2 * g3_19 +
f8 * g2_19 +
f9_2 * g1_19;
var h1 = f0 * g1 +
f1 * g0 +
f2 * g9_19 +
f3 * g8_19 +
f4 * g7_19 +
f5 * g6_19 +
f6 * g5_19 +
f7 * g4_19 +
f8 * g3_19 +
f9 * g2_19;
var h2 = f0 * g2 +
f1_2 * g1 +
f2 * g0 +
f3_2 * g9_19 +
f4 * g8_19 +
f5_2 * g7_19 +
f6 * g6_19 +
f7_2 * g5_19 +
f8 * g4_19 +
f9_2 * g3_19;
var h3 = f0 * g3 +
f1 * g2 +
f2 * g1 +
f3 * g0 +
f4 * g9_19 +
f5 * g8_19 +
f6 * g7_19 +
f7 * g6_19 +
f8 * g5_19 +
f9 * g4_19;
var h4 = f0 * g4 +
f1_2 * g3 +
f2 * g2 +
f3_2 * g1 +
f4 * g0 +
f5_2 * g9_19 +
f6 * g8_19 +
f7_2 * g7_19 +
f8 * g6_19 +
f9_2 * g5_19;
var h5 = f0 * g5 +
f1 * g4 +
f2 * g3 +
f3 * g2 +
f4 * g1 +
f5 * g0 +
f6 * g9_19 +
f7 * g8_19 +
f8 * g7_19 +
f9 * g6_19;
var h6 = f0 * g6 +
f1_2 * g5 +
f2 * g4 +
f3_2 * g3 +
f4 * g2 +
f5_2 * g1 +
f6 * g0 +
f7_2 * g9_19 +
f8 * g8_19 +
f9_2 * g7_19;
var h7 = f0 * g7 +
f1 * g6 +
f2 * g5 +
f3 * g4 +
f4 * g3 +
f5 * g2 +
f6 * g1 +
f7 * g0 +
f8 * g9_19 +
f9 * g8_19;
var h8 = f0 * g8 +
f1_2 * g7 +
f2 * g6 +
f3_2 * g5 +
f4 * g4 +
f5_2 * g3 +
f6 * g2 +
f7_2 * g1 +
f8 * g0 +
f9_2 * g9_19;
var h9 = f0 * g9 +
f1 * g8 +
f2 * g7 +
f3 * g6 +
f4 * g5 +
f5 * g4 +
f6 * g3 +
f7 * g2 +
f8 * g1 +
f9 * g0;
FeCombine(h, h0, h1, h2, h3, h4, h5, h6, h7, h8, h9);
}