matMulGPU function

///////////////////////////////// Matrix Operations (200-299) /////////////////////////////////

Implementation

/// Records a matrix-multiplication op for `a` and `b` on [tape] and returns
/// the tensor that will hold the result.
///
/// The dimensions are read as `M = a.shape[0]`, `N = a.shape[1]` and
/// `P = b.shape[1]`; the output tensor is created with shape `[M, P]`.
/// The returned tensor's `creator` is a [GPUNode] over `[a, b]` whose
/// callback records two further `OP_MATMUL` entries targeting
/// `'<a.id>_grad'` and `'<b.id>_grad'`, and whose `cost` is `2 * M * N * P`.
///
/// Throws an [ArgumentError] if `b.shape[0]` differs from `a.shape[1]`,
/// before anything is allocated or written to [tape].
GPUTensor<Matrix> matMulGPU(GPUTensor<Matrix> a, GPUTensor<Matrix> b, CommandBuffer tape) {
  int M = a.shape[0];
  int N = a.shape[1];
  int P = b.shape[1];

  // Reject incompatible operands up front: without this check a mismatched
  // inner dimension would still allocate an [M, P] output and enqueue the op.
  if (b.shape[0] != N) {
    throw ArgumentError(
      'matMulGPU: inner dimensions do not match '
      '(a is ${M}x$N, b is ${b.shape[0]}x$P)',
    );
  }

  // Instantly reserve VRAM based on shape, bypassing the Dart Heap
  GPUTensor<Matrix> out = GPUTensor<Matrix>.empty(<int>[M, P]);

  // Forward record. Field order is significant: opcode, two input ids,
  // output id, two bools, two floats, one bool.
  // NOTE(review): the two bools look like transpose flags for the first and
  // second operand, and the two floats like GEMM alpha/beta scalars (beta 0.0
  // = overwrite the destination) — confirm against the OP_MATMUL decoder.
  tape.putInt(OP_MATMUL);
  tape.putString(a.id);
  tape.putString(b.id);
  tape.putString(out.id);
  tape.putBool(false);
  tape.putBool(false);
  tape.putFloat(1.0);
  tape.putFloat(0.0);
  tape.putBool(true); // <--- Tensor Cores ON for Forward Pass

  // Multiply-add count of an (M x N) by (N x P) product.
  int cost = 2 * M * N * P;

  out.creator = GPUNode(
    <GPUTensor>[a, b],
        (CommandBuffer bTape) {
      // First backward record: inputs '<out.id>_grad' and b, destination
      // '<a.id>_grad', second bool set.
      // NOTE(review): presumably a_grad += out_grad * transpose(b), with the
      // second float (1.0) accumulating into the existing gradient — confirm.
      bTape.putInt(OP_MATMUL);
      bTape.putString('${out.id}_grad');
      bTape.putString(b.id);
      bTape.putString('${a.id}_grad');
      bTape.putBool(false);
      bTape.putBool(true);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true);

      // Second backward record: inputs a and '<out.id>_grad', destination
      // '<b.id>_grad', first bool set.
      // NOTE(review): presumably b_grad += transpose(a) * out_grad — confirm.
      bTape.putInt(OP_MATMUL);
      bTape.putString(a.id);
      bTape.putString('${out.id}_grad');
      bTape.putString('${b.id}_grad');
      bTape.putBool(true);
      bTape.putBool(false);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true);
    },
    opName: 'matMulGPU',
    cost: cost,
  );

  return out;
}