matMulBiasReluGPU function

GPUTensor<Matrix> matMulBiasReluGPU(

)

///////////////////////////////// Fused Kernels (1000+) /////////////////////////////////

Implementation

/// Records a fused matmul + bias + ReLU forward op on [tape] and returns the
/// tensor that will hold the activated result.
///
/// Two `[M, N]` tensors are created with `GPUTensor.empty`, where
/// `M = x.shape[0]` and `N = w.shape[1]`: the returned ReLU output and a
/// pre-activation tensor. The pre-activation tensor is appended to
/// [intermediates] and its id is reused by the backward closure.
///
/// The returned tensor's `creator` is a [GPUNode] over `[x, w, b]` whose
/// closure records, in order: `OP_RELU_BACKWARD`, an `OP_MATMUL` writing
/// `'<w.id>_grad'`, an `OP_MATMUL` writing `'<x.id>_grad'`, and an
/// `OP_SUM_REDUCE_COLUMNS` writing `'<b.id>_grad'`.
///
/// Throws an [ArgumentError] when `w.shape[0]` differs from `x.shape[1]`.
/// The check runs before anything is allocated or written to [tape], so a
/// rejected call leaves [tape] and [intermediates] untouched.
GPUTensor<Matrix> matMulBiasReluGPU(
    GPUTensor<Matrix> x,
    GPUTensor<Matrix> w,
    GPUTensor<Vector> b,
    CommandBuffer tape,
    List<GPUTensor> intermediates,
    ) {
  int M = x.shape[0];
  int K = x.shape[1];
  int N = w.shape[1];

  // x is read as [M, K] and w as [?, N]; the shared dimension has to agree or
  // the recorded forward/backward ops would describe inconsistent operands.
  if (w.shape[0] != K) {
    throw ArgumentError(
      'matMulBiasReluGPU: inner dimensions differ '
      '(x is ${M}x${K}, w is ${w.shape[0]}x${N})',
    );
  }

  // Instantly reserve VRAM without building 33MB of Dart Lists!
  List<int> outShape = <int>[M, N];
  GPUTensor<Matrix> reluOut = GPUTensor<Matrix>.empty(outShape);
  GPUTensor<Matrix> preReluOut = GPUTensor<Matrix>.empty(outShape);

  // Add to trash so it doesn't leak VRAM!
  intermediates.add(preReluOut);

  // Forward record layout: opcode, then the ids of x, w, b, the ReLU output
  // and the pre-ReLU buffer, in that order.
  tape.putInt(OP_MATMUL_BIAS_RELU_FORWARD);
  tape.putString(x.id);
  tape.putString(w.id);
  tape.putString(b.id);
  tape.putString(reluOut.id);
  tape.putString(preReluOut.id);

  // Cost estimate handed to the GPUNode: 2*M*K*N plus two M*N terms.
  int cost = (2 * M * K * N) + (M * N) + (M * N);

  reluOut.creator = GPUNode(
    <GPUTensor>[x, w, b],
        (CommandBuffer bTape) {
      // Step 1: ReLU backward, reading the pre-ReLU buffer and the output
      // gradient and writing '<preReluOut.id>_grad'.
      bTape.putInt(OP_RELU_BACKWARD);
      bTape.putString(preReluOut.id);
      bTape.putString('${reluOut.id}_grad');
      bTape.putString('${preReluOut.id}_grad');

      // Step 2: matmul of x with the pre-ReLU gradient into '<w.id>_grad'.
      // NOTE(review): the two bools are presumably transposeA/transposeB and
      // the two floats alpha/beta (beta = 1.0 would accumulate into the
      // destination) -- confirm against the OP_MATMUL decoder.
      bTape.putInt(OP_MATMUL);
      bTape.putString(x.id);
      bTape.putString('${preReluOut.id}_grad');
      bTape.putString('${w.id}_grad');
      bTape.putBool(true);
      bTape.putBool(false);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true); // Tensor Cores ON

      // Step 3: matmul of the pre-ReLU gradient with w into '<x.id>_grad',
      // using the same record layout with the two leading bools swapped.
      bTape.putInt(OP_MATMUL);
      bTape.putString('${preReluOut.id}_grad');
      bTape.putString(w.id);
      bTape.putString('${x.id}_grad');
      bTape.putBool(false);
      bTape.putBool(true);
      bTape.putFloat(1.0);
      bTape.putFloat(1.0);
      bTape.putBool(true); // Tensor Cores ON

      // Step 4: reduce the pre-ReLU gradient into '<b.id>_grad'.
      bTape.putInt(OP_SUM_REDUCE_COLUMNS);
      bTape.putString('${preReluOut.id}_grad');
      bTape.putString('${b.id}_grad');
    },
    opName: 'matMulBiasReluGPU',
    cost: cost,
  );

  return reluOut;
}

matMulBiasReluGPU function

Implementation

tensor_math_gpu library