feSquareGeneric method

void feSquareGeneric(
  1. Element a
)

Squaring works precisely like multiplication above, but thanks to its symmetry we get to group a few terms together.

                          l4   l3   l2   l1   l0  x
                          l4   l3   l2   l1   l0  =
                         ------------------------
                        l4l0 l3l0 l2l0 l1l0 l0l0  +
                   l4l1 l3l1 l2l1 l1l1 l0l1       +
              l4l2 l3l2 l2l2 l1l2 l0l2            +
         l4l3 l3l3 l2l3 l1l3 l0l3                 +
    l4l4 l3l4 l2l4 l1l4 l0l4                      =
   ----------------------------------------------
      r8   r7   r6   r5   r4   r3   r2   r1   r0

       l4l0    l3l0    l2l0    l1l0    l0l0  +
       l3l1    l2l1    l1l1    l0l1 19×l4l1  +
       l2l2    l1l2    l0l2 19×l4l2 19×l3l2  +
       l1l3    l0l3 19×l4l3 19×l3l3 19×l2l3  +
       l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4  =
      --------------------------------------
         r4      r3      r2      r1      r0

With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with only three Mul64 and four Add64, instead of five and eight.

Implementation

/// Stores the square of [a] in this element's limbs (`l0` .. `l4`).
///
/// Each of the five column sums is accumulated in a [Uint128] from three
/// products whose operands have been pre-scaled by 2, 19 or 38 (= 2×19), so
/// that the symmetric partial products are only computed once.
///
/// Every column is then split at bit 51: the low 51 bits stay in the column
/// and the remainder is carried into the next column. The carry out of the
/// top column is multiplied by 19 and added to column 0. Finally
/// `carryPropagateGeneric()` is invoked on the receiver.
///
/// All limbs of [a] are read before any limb of the receiver is written.
void feSquareGeneric(Element a) {
  // Snapshot the input limbs once.
  final a0 = a.l0;
  final a1 = a.l1;
  final a2 = a.l2;
  final a3 = a.l3;
  final a4 = a.l4;

  // Pre-scaled operands shared by several columns.
  final k38 = 38.toBigInt;

  final a0x2 = a0 * BigInt.two;
  final a1x2 = a1 * BigInt.two;

  final a1x38 = a1 * k38;
  final a2x38 = a2 * k38;
  final a3x38 = a3 * k38;

  final a3x19 = a3 * bigInt19;
  final a4x19 = a4 * bigInt19;

  // Column 0: l0×l0 + 19×2×(l1×l4 + l2×l3)
  final col0 = Uint128.mul64(a0, a0);
  col0.addMul64(a1x38, a4);
  col0.addMul64(a2x38, a3);

  // Column 1: 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
  final col1 = Uint128.mul64(a0x2, a1);
  col1.addMul64(a2x38, a4);
  col1.addMul64(a3x19, a3);

  // Column 2: 2×l0×l2 + l1×l1 + 19×2×l3×l4
  final col2 = Uint128.mul64(a0x2, a2);
  col2.addMul64(a1, a1);
  col2.addMul64(a3x38, a4);

  // Column 3: 2×l0×l3 + 2×l1×l2 + 19×l4×l4
  final col3 = Uint128.mul64(a0x2, a3);
  col3.addMul64(a1x2, a2);
  col3.addMul64(a4x19, a4);

  // Column 4: 2×l0×l4 + 2×l1×l3 + l2×l2
  final col4 = Uint128.mul64(a0x2, a4);
  col4.addMul64(a1x2, a3);
  col4.addMul64(a2, a2);

  // Everything above bit 51 of each column becomes a carry.
  final carry0 = col0.shiftRightBy51();
  final carry1 = col1.shiftRightBy51();
  final carry2 = col2.shiftRightBy51();
  final carry3 = col3.shiftRightBy51();
  final carry4 = col4.shiftRightBy51();

  // Keep the low 51 bits of each column and add the carry from the column
  // below it; the top column's carry wraps around to column 0 scaled by 19.
  final out0 = (col0.low & maskLow51Bits) + (carry4 * bigInt19);
  final out1 = (col1.low & maskLow51Bits) + carry0;
  final out2 = (col2.low & maskLow51Bits) + carry1;
  final out3 = (col3.low & maskLow51Bits) + carry2;
  final out4 = (col4.low & maskLow51Bits) + carry3;

  // Only now overwrite the receiver's limbs.
  l0 = out0;
  l1 = out1;
  l2 = out2;
  l3 = out3;
  l4 = out4;

  carryPropagateGeneric();
}