executeGPUGraph<T> function
Tensor<T>
executeGPUGraph<T>(
  List<Tensor> cpuInputs,
  List<GPUTensor> gpuInputs,
  GPUTensor gpuOutput,
  CommandBuffer forwardTape,
  CommandBuffer backwardTape, {
  String opName = 'gpu_subgraph',
})
Wraps an entire compiled GPU tape into a single CPU Tensor Autograd node.
Implementation
/// Wraps an entire compiled GPU tape into a single CPU Tensor autograd node.
///
/// Forward pass (runs immediately):
///  1. Loads each `cpuInputs[i].dataPtr` into the GPU tensor `gpuInputs[i]`
///     (by id and shape) via [CudaEngine.load].
///  2. Runs [forwardTape].
///  3. Builds a CPU [Tensor] from `_createDummy(gpuOutput.shape)` and fills
///     its `dataPtr` from [gpuOutput] via [CudaEngine.retrieve].
///
/// Backward pass (the closure stored on the returned tensor's `creator`):
///  A. Runs a tape of `OP_ZERO_GRAD` ops over every `'<input id>_grad'`.
///  B. Loads `out.gradPtr` into `'<output id>_grad'`.
///  C. Runs [backwardTape].
///  D. For each input, retrieves `'<input id>_grad'` into a temporary native
///     buffer and adds it into `cpuInputs[i].gradPtr`.
///
/// Parameters:
///  * [cpuInputs] - CPU tensors that feed the graph; also registered as the
///    inputs of the created [Node].
///  * [gpuInputs] - GPU tensors paired index-by-index with [cpuInputs].
///  * [gpuOutput] - GPU tensor holding the result of [forwardTape].
///  * [forwardTape] / [backwardTape] - command buffers whose `bytes()` are
///    handed to [CudaEngine.run].
///  * [opName] - name given to the created [Node]; defaults to
///    `'gpu_subgraph'`.
///
/// Returns the CPU output tensor with its `creator` set.
///
/// Throws an [Exception] if [cpuInputs] and [gpuInputs] differ in length.
Tensor<T> executeGPUGraph<T>(
  List<Tensor> cpuInputs,
  List<GPUTensor> gpuInputs,
  GPUTensor gpuOutput,
  CommandBuffer forwardTape,
  CommandBuffer backwardTape, {
  String opName = 'gpu_subgraph',
}) {
  if (cpuInputs.length != gpuInputs.length) {
    throw Exception("CPU and GPU input lists must be the same length.");
  }

  // 1. Push CPU data to the GPU by handing over each tensor's native pointer.
  for (int i = 0; i < cpuInputs.length; i = i + 1) {
    CudaEngine.load(gpuInputs[i].id, cpuInputs[i].dataPtr, gpuInputs[i].shape);
  }

  // 2. Execute the pre-compiled forward tape.
  CudaEngine.run(forwardTape.bytes());

  // 3. Create the output CPU tensor and pull the result into it.
  dynamic dummyVal = _createDummy(gpuOutput.shape);
  Tensor<T> out = Tensor<T>(dummyVal);
  CudaEngine.retrieve(gpuOutput.id, out.dataPtr);

  // 4. Attach the CPU autograd node.
  out.creator = Node(
    cpuInputs,
    () {
      // A. Zero the GPU gradients of the inputs so that values from an
      //    earlier backward pass are not accumulated into this one.
      CommandBuffer zeroTape = CommandBuffer();
      for (int i = 0; i < gpuInputs.length; i = i + 1) {
        zeroTape.putInt(OP_ZERO_GRAD);
        zeroTape.putString('${gpuInputs[i].id}_grad');
      }
      CudaEngine.run(zeroTape.bytes());

      // B. Push the accumulated CPU output gradient into the GPU graph.
      CudaEngine.load('${gpuOutput.id}_grad', out.gradPtr, gpuOutput.shape);

      // C. Execute the pre-compiled backward tape.
      CudaEngine.run(backwardTape.bytes());

      // D. Pull each GPU input gradient back and add it into the CPU gradient.
      for (int i = 0; i < cpuInputs.length; i = i + 1) {
        // Element count is the product of the GPU tensor's dimensions.
        final List<int> shape = gpuInputs[i].shape;
        int numElements = 1;
        for (final int dim in shape) {
          numElements = numElements * dim;
        }

        // NOTE(review): the staging buffer is always Float even though this
        // function is generic in T - confirm gradients are always float32.
        final Pointer<Float> tempGradPtr = calloc<Float>(numElements);
        try {
          CudaEngine.retrieve('${gpuInputs[i].id}_grad', tempGradPtr);
          // The addition is delegated to the engine rather than looping over
          // the elements in Dart.
          CudaEngine.addPointers(
            cpuInputs[i].gradPtr,
            tempGradPtr,
            numElements,
          );
        } finally {
          // Always release the native buffer, even if retrieve/addPointers
          // throws; native memory is not reclaimed by the Dart GC.
          calloc.free(tempGradPtr);
        }
      }
    },
    opName: opName,
    extraParams: {
      'gpu_output': gpuOutput,
      'gpu_inputs': gpuInputs,
    },
  );
  return out;
}