matMulGPU function
Matrix Operations (200-299)
Implementation
/// Records a matrix multiplication of [a] and [b] onto [tape] and attaches
/// the matching backward pass to the returned tensor.
///
/// [a] is read as shape `[M, N]` and [b] as `[N, P]`. The result is a fresh
/// `GPUTensor<Matrix>` of shape `[M, P]` whose `creator` is a [GPUNode] with
/// parents `[a, b]`, op name `'matMulGPU'` and a cost of `2 * M * N * P`.
///
/// The backward callback writes two further `OP_MATMUL` commands, one
/// targeting `'<a.id>_grad'` and one targeting `'<b.id>_grad'`, both reading
/// `'<out.id>_grad'`.
///
/// Throws an [ArgumentError] when `a.shape[1]` differs from `b.shape[0]`;
/// in that case nothing is allocated and nothing is written to [tape].
GPUTensor<Matrix> matMulGPU(GPUTensor<Matrix> a, GPUTensor<Matrix> b, CommandBuffer tape) {
  int M = a.shape[0];
  int N = a.shape[1];
  int P = b.shape[1];

  // Fail fast at the call site: a mismatched product would otherwise be
  // recorded on the tape and only surface later, when the command runs.
  if (b.shape[0] != N) {
    throw ArgumentError(
        'matMulGPU: inner dimensions must match, got a=[$M, $N] and b=[${b.shape[0]}, $P]');
  }

  // Instantly reserve VRAM based on shape, bypassing the Dart Heap
  GPUTensor<Matrix> out = GPUTensor<Matrix>.empty(<int>[M, P]);

  // Forward command: opcode, two input ids, output id, then five scalars.
  // NOTE(review): the two leading bools look like transpose flags for the
  // first/second operand and the two floats like GEMM alpha/beta scalars
  // (beta 0.0 = overwrite the destination) — confirm against the OP_MATMUL
  // decoder.
  tape.putInt(OP_MATMUL);
  tape.putString(a.id);
  tape.putString(b.id);
  tape.putString(out.id);
  tape.putBool(false);
  tape.putBool(false);
  tape.putFloat(1.0);
  tape.putFloat(0.0);
  tape.putBool(true); // <--- Tensor Cores ON for Forward Pass

  // Cost estimate handed to the GPUNode below.
  int cost = 2 * M * N * P;

  out.creator = GPUNode(
    <GPUTensor>[a, b],
    (CommandBuffer bTape) {
      // Gradient for `a`: combines out's gradient with `b`, second flag set.
      // NOTE(review): consistent with dA += dOut * B^T if the flags are
      // transposes and the second float (1.0) means "accumulate" — confirm.
      bTape.putInt(OP_MATMUL);
      bTape.putString('${out.id}_grad');
      bTape.putString(b.id);
      bTape.putString('${a.id}_grad');
      bTape.putBool(false);
      bTape.putBool(true);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true);

      // Gradient for `b`: combines `a` with out's gradient, first flag set.
      // NOTE(review): consistent with dB += A^T * dOut under the same
      // reading of the flags — confirm.
      bTape.putInt(OP_MATMUL);
      bTape.putString(a.id);
      bTape.putString('${out.id}_grad');
      bTape.putString('${b.id}_grad');
      bTape.putBool(true);
      bTape.putBool(false);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true);
    },
    opName: 'matMulGPU',
    cost: cost,
  );
  return out;
}