matMulBiasReluGPU function
///////////////////////////////// Fused Kernels (1000+) /// /////////////////////////////////
Implementation
/// Records a fused matmul + bias + ReLU forward op on [tape] and attaches the
/// matching backward recorder to the returned tensor.
///
/// Shapes are read as `x: [M, K]` and `w: [K, N]`; both outputs are `[M, N]`.
/// [b] is handed to the kernel by id only and its shape is never inspected
/// here (presumably one bias value per output column; TODO confirm against
/// the kernel).
///
/// Two `[M, N]` tensors are allocated: the returned ReLU output and a
/// pre-activation buffer that the backward pass reads. The pre-activation
/// buffer is appended to [intermediates]; per the original author's note that
/// list is how the buffer gets released, so the caller presumably frees what
/// it collects there.
///
/// Throws [ArgumentError] if the inner dimensions of [x] and [w] differ. The
/// check runs before anything is allocated, added to [intermediates], or
/// written to [tape], so a rejected call leaves no partial state behind.
GPUTensor<Matrix> matMulBiasReluGPU(
  GPUTensor<Matrix> x,
  GPUTensor<Matrix> w,
  GPUTensor<Vector> b,
  CommandBuffer tape,
  List<GPUTensor> intermediates,
) {
  int M = x.shape[0];
  int K = x.shape[1];
  int N = w.shape[1];

  // K is taken from x alone; a w whose row count disagrees would otherwise be
  // recorded silently and only fail (or misbehave) when the tape is executed.
  if (w.shape[0] != K) {
    throw ArgumentError(
      'matMulBiasReluGPU: inner dimensions differ '
      '(x is [$M, $K], w is [${w.shape[0]}, $N])',
    );
  }

  // Allocate by shape only (original note: avoids building large Dart lists
  // just to reserve device memory).
  List<int> outShape = <int>[M, N];
  GPUTensor<Matrix> reluOut = GPUTensor<Matrix>.empty(outShape);
  GPUTensor<Matrix> preReluOut = GPUTensor<Matrix>.empty(outShape);

  // Hand the pre-activation buffer to the caller's cleanup list so it is not
  // leaked; the closure below still reads it during the backward pass.
  intermediates.add(preReluOut);

  // Forward op: opcode, then the three inputs, then the two outputs.
  tape.putInt(OP_MATMUL_BIAS_RELU_FORWARD);
  tape.putString(x.id);
  tape.putString(w.id);
  tape.putString(b.id);
  tape.putString(reluOut.id);
  tape.putString(preReluOut.id);

  // Cost estimate: 2*M*K*N plus two M*N terms (presumably the matmul FLOPs
  // followed by the bias add and the ReLU).
  int cost = (2 * M * K * N) + (M * N) + (M * N);

  reluOut.creator = GPUNode(
    <GPUTensor>[x, w, b],
    (CommandBuffer bTape) {
      // Gradient buffers are addressed by the "<id>_grad" naming convention.
      final String preGradId = '${preReluOut.id}_grad';

      // 1) ReLU backward: pre-activation + upstream gradient -> gradient
      //    with respect to the pre-activation.
      bTape.putInt(OP_RELU_BACKWARD);
      bTape.putString(preReluOut.id);
      bTape.putString('${reluOut.id}_grad');
      bTape.putString(preGradId);

      // 2) Weight gradient from x and the pre-activation gradient.
      //    NOTE(review): the bool pair (true, false) is presumably
      //    transpose-A / transpose-B and the two floats alpha / beta;
      //    verify against the OP_MATMUL decoder.
      bTape.putInt(OP_MATMUL);
      bTape.putString(x.id);
      bTape.putString(preGradId);
      bTape.putString('${w.id}_grad');
      bTape.putBool(true);
      bTape.putBool(false);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true); // Tensor Cores ON

      // 3) Input gradient from the pre-activation gradient and w, with the
      //    bool pair flipped to (false, true).
      bTape.putInt(OP_MATMUL);
      bTape.putString(preGradId);
      bTape.putString(w.id);
      bTape.putString('${x.id}_grad');
      bTape.putBool(false);
      bTape.putBool(true);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true); // Tensor Cores ON

      // 4) Bias gradient: column-wise reduction of the pre-activation
      //    gradient.
      bTape.putInt(OP_SUM_REDUCE_COLUMNS);
      bTape.putString(preGradId);
      bTape.putString('${b.id}_grad');
    },
    opName: 'matMulBiasReluGPU',
    cost: cost,
  );
  return reluOut;
}