executeGPUGraph<T> function

Tensor<T> executeGPUGraph<T>(
  List<Tensor> cpuInputs,
  List<GPUTensor> gpuInputs,
  GPUTensor gpuOutput,
  CommandBuffer forwardTape,
  CommandBuffer backwardTape, {
  String opName = 'gpu_subgraph',
})

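Runs a pre-compiled GPU forward tape and wraps the result in a single CPU Tensor whose autograd node replays the matching backward tape, so the whole GPU subgraph appears as one operation in the CPU autograd graph.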

Implementation

/// Runs a pre-compiled GPU forward tape and wraps the result in a single CPU
/// [Tensor] whose autograd [Node] replays the matching backward tape.
///
/// Each entry of [cpuInputs] is uploaded into the [GPUTensor] at the same
/// index of [gpuInputs] (using the GPU tensor's shape), [forwardTape] is
/// executed, and the contents of [gpuOutput] are copied into a freshly
/// created CPU tensor, which is returned.
///
/// The returned tensor's `creator` is a node over [cpuInputs] labelled
/// [opName]. Its backward callback:
///  * zeroes the `<id>_grad` buffer of every GPU input,
///  * uploads the output tensor's gradient into `<gpuOutput.id>_grad`,
///  * runs [backwardTape],
///  * adds each GPU input gradient into the matching CPU input's gradient
///    buffer.
///
/// Throws an [Exception] if [cpuInputs] and [gpuInputs] differ in length.
Tensor<T> executeGPUGraph<T>(
    List<Tensor> cpuInputs,
    List<GPUTensor> gpuInputs,
    GPUTensor gpuOutput,
    CommandBuffer forwardTape,
    CommandBuffer backwardTape, {
      String opName = 'gpu_subgraph',
    }) {
  if (cpuInputs.length != gpuInputs.length) {
    throw Exception("CPU and GPU input lists must be the same length.");
  }

  // 1. Upload every CPU input into its GPU counterpart. The CPU tensor's
  //    native data pointer is handed to the engine together with the GPU
  //    tensor's shape.
  //    NOTE(review): assumes cpuInputs[i] holds at least as many elements as
  //    gpuInputs[i].shape describes; nothing here verifies that.
  for (int i = 0; i < cpuInputs.length; i = i + 1) {
    CudaEngine.load(gpuInputs[i].id, cpuInputs[i].dataPtr, gpuInputs[i].shape);
  }

  // 2. Execute the pre-compiled forward tape.
  CudaEngine.run(forwardTape.bytes());

  // 3. Create a CPU tensor shaped like the GPU output and copy the result
  //    into it.
  dynamic dummyVal = _createDummy(gpuOutput.shape);
  Tensor<T> out = Tensor<T>(dummyVal);
  CudaEngine.retrieve(gpuOutput.id, out.dataPtr);

  // 4. Attach the CPU autograd node. The callback below is the backward
  //    pass for the whole GPU subgraph.
  out.creator = Node(
    cpuInputs,
    () {
      // A. Zero the GPU-side input gradients so values left over from an
      //    earlier backward pass do not carry over into this one.
      CommandBuffer zeroTape = CommandBuffer();
      for (int i = 0; i < gpuInputs.length; i = i + 1) {
        zeroTape.putInt(OP_ZERO_GRAD);
        zeroTape.putString('${gpuInputs[i].id}_grad');
      }
      CudaEngine.run(zeroTape.bytes());

      // B. Push the CPU output gradient into the GPU graph.
      CudaEngine.load('${gpuOutput.id}_grad', out.gradPtr, gpuOutput.shape);

      // C. Execute the pre-compiled backward tape.
      CudaEngine.run(backwardTape.bytes());

      // D. Pull each GPU input gradient back and add it into the CPU
      //    gradient buffer.
      for (int i = 0; i < cpuInputs.length; i = i + 1) {
        // Element count is the product of the GPU tensor's dimensions.
        int numElements = 1;
        List<int> sList = gpuInputs[i].shape;
        for (int s = 0; s < sList.length; s = s + 1) {
          numElements = numElements * sList[s];
        }

        // Native scratch buffer: it is not managed by the Dart GC, so it
        // must be released even when one of the engine calls throws.
        Pointer<Float> tempGradPtr = calloc<Float>(numElements);
        try {
          CudaEngine.retrieve('${gpuInputs[i].id}_grad', tempGradPtr);

          // Accumulate on the native side instead of looping in Dart.
          CudaEngine.addPointers(
              cpuInputs[i].gradPtr, tempGradPtr, numElements);
        } finally {
          calloc.free(tempGradPtr);
        }
      }
    },
    opName: opName,
    extraParams: {
      'gpu_output': gpuOutput,
      'gpu_inputs': gpuInputs,
    },
  );

  return out;
}