main function
void
main()
Implementation
/// Runs the CUDA engine benchmark suite from start to finish.
///
/// Initializes [CudaEngine], prints a banner describing the problem sizes and
/// then hands every operation to `runBenchmarkIsolated` together with four
/// callbacks: one that allocates the inputs, one that fills them, one that
/// records the operation under test into a [CommandBuffer], and one that frees
/// every tensor involved. Benchmarks run in the order they are written below.
///
/// `CudaEngine.dispose()` sits in a `finally` block so the engine is released
/// even when a benchmark throws.
///
/// NOTE(review): the value returned by `runBenchmarkIsolated` is discarded. If
/// it turns out to return a `Future`, the calls must be awaited — confirm
/// against its declaration.
void main() async {
  await CudaEngine.initialize(debug: false);

  try {
    int N = 67108864 * 2;
    List<int> vecShape = <int>[N];
    int M = 4096 * 2;
    List<int> matShape = <int>[M, M];
    int iterations = 50;

    print("\n==================================================================");
    print(" CUDA ENGINE PERFORMANCE BENCHMARK ");
    print("==================================================================");
    print("Vector Size: $N elements (~${(N * 4 / 1000000).toStringAsFixed(0)} MB)");
    print("Matrix Size: ${M}x$M elements");
    print("Iterations: $iterations");
    print("Note: VRAM is aggressively wiped and reallocated between each run to test loading speeds of different operations. ");
    print("------------------------------------------------------------------\n");

    // Traffic / work estimates shared by several benchmarks. Every estimate
    // in this function counts 4 bytes per tensor element.
    int unaryBytes = N * 8; // Read N, Write N
    int binaryBytes = N * 12; // Read 2N, Write N
    int elementFlops = N;
    int matmulBytes = M * M * 12;
    int matmulFlops = 2 * M * M * M;

    // ---------------------------------------------------------------------
    // Local helpers for the benchmark shapes that repeat many times.
    // Each one owns its tensors and forwards to runBenchmarkIsolated.
    // ---------------------------------------------------------------------

    // Benchmarks `op` on two `vecShape` vectors filled with [fillA] / [fillB],
    // reporting `binaryBytes` and `elementFlops` per operation.
    void benchVectorBinary(
      String name,
      double fillA,
      double fillB,
      GPUTensor<Vector> Function(
              GPUTensor<Vector> a, GPUTensor<Vector> b, CommandBuffer tape)
          op,
    ) {
      GPUTensor<Vector>? lhs;
      GPUTensor<Vector>? rhs;
      GPUTensor<Vector>? result;
      runBenchmarkIsolated(
        name: name,
        iterations: iterations,
        bytesPerOp: binaryBytes,
        flopsPerOp: elementFlops,
        allocateInputs: () {
          lhs = GPUTensor<Vector>.empty(vecShape);
          rhs = GPUTensor<Vector>.empty(vecShape);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(lhs!, fillA, t);
          fillTensorGPU(rhs!, fillB, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          result = op(lhs!, rhs!, tape);
          return tape;
        },
        freeAll: () {
          lhs!.free();
          rhs!.free();
          result!.free();
        },
      );
    }

    // Benchmarks `op` on one `vecShape` vector filled with [fill], reporting
    // `unaryBytes` and `elementFlops * flopsPerElement` per operation.
    void benchVectorUnary(
      String name,
      double fill,
      GPUTensor<Vector> Function(GPUTensor<Vector> x, CommandBuffer tape) op, {
      int flopsPerElement = 1,
    }) {
      GPUTensor<Vector>? input;
      GPUTensor<Vector>? result;
      runBenchmarkIsolated(
        name: name,
        iterations: iterations,
        bytesPerOp: unaryBytes,
        flopsPerOp: elementFlops * flopsPerElement,
        allocateInputs: () {
          input = GPUTensor<Vector>.empty(vecShape);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(input!, fill, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          result = op(input!, tape);
          return tape;
        },
        freeAll: () {
          input!.free();
          result!.free();
        },
      );
    }

    // Benchmarks `op` reducing two `vecShape` vectors to a scalar, reporting
    // `N * bytesPerElement` and `N * flopsPerElement` per operation.
    void benchVectorPairToScalar(
      String name,
      double fillA,
      double fillB,
      GPUTensor<Scalar> Function(
              GPUTensor<Vector> a, GPUTensor<Vector> b, CommandBuffer tape)
          op, {
      required int bytesPerElement,
      required int flopsPerElement,
    }) {
      GPUTensor<Vector>? lhs;
      GPUTensor<Vector>? rhs;
      GPUTensor<Scalar>? result;
      runBenchmarkIsolated(
        name: name,
        iterations: iterations,
        bytesPerOp: N * bytesPerElement,
        flopsPerOp: N * flopsPerElement,
        allocateInputs: () {
          lhs = GPUTensor<Vector>.empty(vecShape);
          rhs = GPUTensor<Vector>.empty(vecShape);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(lhs!, fillA, t);
          fillTensorGPU(rhs!, fillB, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          result = op(lhs!, rhs!, tape);
          return tape;
        },
        freeAll: () {
          lhs!.free();
          rhs!.free();
          result!.free();
        },
      );
    }

    // Benchmarks `op` on one M x M matrix filled with [fill] that produces
    // another matrix; the caller supplies the per-operation estimates.
    void benchSquareMatrix(
      String name,
      double fill,
      GPUTensor<Matrix> Function(GPUTensor<Matrix> m, CommandBuffer tape) op, {
      required int bytesPerOp,
      required int flopsPerOp,
    }) {
      GPUTensor<Matrix>? input;
      GPUTensor<Matrix>? result;
      runBenchmarkIsolated(
        name: name,
        iterations: iterations,
        bytesPerOp: bytesPerOp,
        flopsPerOp: flopsPerOp,
        allocateInputs: () {
          input = GPUTensor<Matrix>.empty(matShape);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(input!, fill, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          result = op(input!, tape);
          return tape;
        },
        freeAll: () {
          input!.free();
          result!.free();
        },
      );
    }

    // --- BINARY OPERATIONS ---
    benchVectorBinary(
        "ADD", 2.0, 3.0, (a, b, tape) => addGPU<Vector>(a, b, tape));

    // --- MARKOV CHAIN ---
    {
      int markovSeqLen = 67108864; // ~67 Million elements
      int numStates = 16;
      int order = 2;
      int buildBytes = markovSeqLen * 4;
      int buildFlops = markovSeqLen;
      GPUTensor<Vector>? sequence;
      GPUTensor<Matrix>? probTable;
      runBenchmarkIsolated(
        name: "MARKOV_TBL",
        iterations: iterations,
        bytesPerOp: buildBytes,
        flopsPerOp: buildFlops,
        allocateInputs: () {
          // 1. Generate random floats in range [-8.0, 8.0] directly in VRAM
          sequence = GPUTensor<Vector>.randomUniform(
              <int>[markovSeqLen], numStates / 2.0);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          // 2. Shift [-8.0, 8.0] to [0.0, 16.0]
          GPUTensor<Vector> shifted =
              addScalarVectorGPU(sequence!, numStates / 2.0, t);
          // 3. Clamp [0.0, 16.0] to [0.0, 15.0] to guarantee valid state indices
          GPUTensor<Vector> clamped = clampGPU<Vector>(
              shifted, 0.0, (numStates - 1).toDouble(), t);
          // 4. Overwrite our sequence with the valid randomized states
          t.putInt(OP_COPY);
          t.putString(clamped.id);
          t.putString(sequence!.id);
          CudaEngine.run(t.bytes());
          // Clean up the temporary conversion tensors so VRAM doesn't leak
          shifted.free();
          clamped.free();
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          probTable = buildMarkovTableGPU(sequence!, order, numStates, tape);
          return tape;
        },
        freeAll: () {
          sequence!.free();
          probTable!.free();
        },
      );
    }
    {
      int numStates = 16;
      int order = 2;
      int numHistories = 256; // 16^2
      // 10 Million predictions per pass
      int predictBatchSize = 10000000;
      // Read histories (batch * order * 4), Write Probs (batch * numStates * 4)
      int predictBytes = (predictBatchSize * order * 4) +
          (predictBatchSize * numStates * 4);
      int predictFlops = predictBatchSize * numStates;
      GPUTensor<Matrix>? historyBatch;
      GPUTensor<Matrix>? probTable;
      GPUTensor<Matrix>? predictions;
      runBenchmarkIsolated(
        name: "MARKOV_PRD",
        iterations: iterations,
        bytesPerOp: predictBytes,
        flopsPerOp: predictFlops,
        allocateInputs: () {
          historyBatch =
              GPUTensor<Matrix>.empty(<int>[predictBatchSize, order]);
          probTable = GPUTensor<Matrix>.empty(<int>[numHistories, numStates]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(historyBatch!, 1.0, t);
          fillTensorGPU(probTable!, 0.0625, t); // 1/16th uniform prob
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          predictions =
              markovPredictGPU(historyBatch!, probTable!, numStates, tape);
          return tape;
        },
        freeAll: () {
          historyBatch!.free();
          probTable!.free();
          predictions!.free();
        },
      );
    }

    // --- BINARY OPERATIONS (continued) ---
    benchVectorBinary(
        "SUBTRACT", 5.0, 2.0, (a, b, tape) => subtractGPU<Vector>(a, b, tape));
    benchVectorBinary(
        "MULTIPLY", 2.0, 3.0, (a, b, tape) => multiplyGPU<Vector>(a, b, tape));
    benchVectorBinary(
        "DIVIDE", 10.0, 2.0, (a, b, tape) => divideGPU<Vector>(a, b, tape));

    // --- UNARY OPERATIONS ---
    benchVectorUnary("ABS", -2.0, (x, tape) => absGPU<Vector>(x, tape));
    benchVectorUnary("SQRT", 4.0, (x, tape) => sqrtGPU<Vector>(x, tape));
    benchVectorUnary("LOG", 2.718, (x, tape) => logGPU<Vector>(x, tape));
    benchVectorUnary("POW", 3.0, (x, tape) => powGPU<Vector>(x, 3.0, tape));
    benchVectorUnary(
        "CLAMP", 10.0, (x, tape) => clampGPU<Vector>(x, 0.0, 5.0, tape));

    // --- MATMUL ---
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Matrix>? matB;
      GPUTensor<Matrix>? outMatmul;
      runBenchmarkIsolated(
        name: "MATMUL",
        iterations: iterations,
        bytesPerOp: matmulBytes,
        flopsPerOp: matmulFlops,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(matShape);
          matB = GPUTensor<Matrix>.empty(matShape);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          fillTensorGPU(matB!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          // The op is encoded by hand, so the output tensor is allocated here.
          outMatmul = GPUTensor<Matrix>.empty(matShape);
          tape.putInt(OP_MATMUL);
          tape.putString(matA!.id);
          tape.putString(matB!.id);
          tape.putString(outMatmul!.id);
          // NOTE(review): the meaning of the trailing flags and scalars is
          // defined by the OP_MATMUL decoder (presumably transpose flags and
          // alpha/beta) — confirm there before changing them.
          tape.putBool(false);
          tape.putBool(false);
          tape.putFloat(1.0);
          tape.putFloat(0.0);
          tape.putBool(true);
          return tape;
        },
        freeAll: () {
          matA!.free();
          matB!.free();
          outMatmul!.free();
        },
      );
    }

    // --- LINEAR ALGEBRA & BROADCASTING ---
    benchSquareMatrix(
      "TRANSPOSE",
      1.0,
      (m, tape) => transposeGPU(m, tape),
      bytesPerOp: M * M * 8, // Read M (4 bytes), Write Out (4 bytes)
      flopsPerOp: 0, // Zero math, purely memory bound
    );
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Vector>? vecB;
      GPUTensor<Vector>? out;
      runBenchmarkIsolated(
        name: "MAT_VEC",
        iterations: iterations,
        bytesPerOp: (M * M * 4) + (M * 4) + (M * 4),
        flopsPerOp: 2 * M * M,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[M, M]);
          vecB = GPUTensor<Vector>.empty(<int>[M]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          fillTensorGPU(vecB!, 2.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          out = matVecMulGPU(matA!, vecB!, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          vecB!.free();
          out!.free();
        },
      );
    }
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Vector>? vecBias;
      GPUTensor<Matrix>? out;
      runBenchmarkIsolated(
        name: "ADD_BIAS",
        iterations: iterations,
        bytesPerOp: (M * M * 4) + (M * 4) + (M * M * 4),
        flopsPerOp: M * M,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[M, M]);
          vecBias = GPUTensor<Vector>.empty(<int>[M]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          fillTensorGPU(vecBias!, 2.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          out = addBiasToMatMulOutGPU(matA!, vecBias!, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          vecBias!.free();
          out!.free();
        },
      );
    }
    benchSquareMatrix(
      "SCALE_MAT",
      2.0,
      (m, tape) => scaleMatrixGPU(m, 3.0, tape),
      bytesPerOp: M * M * 8, // Read M, Write C
      flopsPerOp: M * M,
    );
    // Uses the giant vector of N elements.
    benchVectorUnary(
        "ADD_SCALAR", 1.0, (x, tape) => addScalarVectorGPU(x, 5.0, tape));

    // --- ACTIVATIONS ---
    // Mix of values doesn't matter for memory bandwidth, fill with 1.0.
    // One max() operation per element.
    benchVectorUnary("RELU", 1.0, (x, tape) => reluGPU(x, tape));
    // exp, add, div
    benchVectorUnary("SIGMOID", 0.5, (x, tape) => sigmoidGPU(x, tape),
        flopsPerElement: 3);
    // hardware dependent, roughly 3 ops
    benchVectorUnary("TANH", 0.5, (x, tape) => vectorTanhGPU(x, tape),
        flopsPerElement: 3);
    // GELU is mathematically heavier
    benchVectorUnary("GELU", 0.5, (x, tape) => geluGPU(x, tape),
        flopsPerElement: 5);
    benchSquareMatrix(
      "SOFTMAX",
      1.0,
      (m, tape) => softmaxMatrixGPU(m, tape),
      // Reads row for max, reads for sum, writes out (rough approx)
      bytesPerOp: M * M * 12,
      flopsPerOp: M * M * 3,
    );

    // --- LOSS FUNCTIONS (Memory Bound) ---
    // Preds between 0 and 1, targets 0 or 1.
    // Bytes: read preds (4), read targets (4), write 1 scalar.
    // Flops: log, mul, add, mul.
    benchVectorPairToScalar(
      "BCE_LOSS",
      0.5,
      1.0,
      (a, b, tape) => binaryCrossEntropyGPU<Vector>(a, b, tape),
      bytesPerElement: 8,
      flopsPerElement: 4,
    );
    // Bytes: read preds, read targets. Flops: sub, square, add (reduction).
    benchVectorPairToScalar(
      "MSE_VEC",
      2.0,
      1.0,
      (a, b, tape) => mseGPU(a, b, tape),
      bytesPerElement: 8,
      flopsPerElement: 3,
    );
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Matrix>? matB;
      GPUTensor<Scalar>? outLoss;
      runBenchmarkIsolated(
        name: "MSE_MAT",
        iterations: iterations,
        bytesPerOp: M * M * 8,
        flopsPerOp: M * M * 3,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[M, M]);
          matB = GPUTensor<Matrix>.empty(<int>[M, M]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 2.0, t);
          fillTensorGPU(matB!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outLoss = mseMatrixGPU(matA!, matB!, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          matB!.free();
          outLoss!.free();
        },
      );
    }

    // --- REDUCTIONS (Memory Bound) ---
    {
      GPUTensor<Vector>? vecA;
      GPUTensor<Scalar>? outSum;
      runBenchmarkIsolated(
        name: "SUM_VEC",
        iterations: iterations,
        bytesPerOp: N * 4, // Read N, write 1
        flopsPerOp: N,
        allocateInputs: () {
          vecA = GPUTensor<Vector>.empty(<int>[N]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(vecA!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outSum = sumGPU(vecA!, tape);
          return tape;
        },
        freeAll: () {
          vecA!.free();
          outSum!.free();
        },
      );
    }
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Vector>? outCols;
      runBenchmarkIsolated(
        name: "SUM_COLS",
        iterations: iterations,
        bytesPerOp: (M * M * 4) + (M * 4), // Read MxM, Write M
        flopsPerOp: M * M,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[M, M]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outCols = sumReduceColumnsGPU(matA!, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          outCols!.free();
        },
      );
    }
    {
      GPUTensor<Matrix>? matA;
      GPUTensor<Vector>? outRows;
      runBenchmarkIsolated(
        name: "SUM_ROWS",
        iterations: iterations,
        bytesPerOp: (M * M * 4) + (M * 4), // Read MxM, Write M
        flopsPerOp: M * M,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[M, M]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outRows = sumReduceRowsGPU(matA!, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          outRows!.free();
        },
      );
    }

    // --- EMBEDDING LOOKUPS (Memory Scatter/Gather Bound) ---
    {
      int vocabSize = 32000;
      int embedDim = 768; // Standard LLM/BERT hidden size
      int numTokens = 1048576; // 1 Million tokens
      int embedBytes = (numTokens * 4) + // Read indices
          (numTokens * embedDim * 4) + // Gather from weights
          (numTokens * embedDim * 4); // Write output
      int embedFlops = 0; // Pure memory routing
      GPUTensor<Vector>? indices;
      GPUTensor<Matrix>? weights;
      GPUTensor<Matrix>? outEmbed;
      runBenchmarkIsolated(
        name: "EMBED_VEC",
        iterations: iterations,
        bytesPerOp: embedBytes,
        flopsPerOp: embedFlops,
        allocateInputs: () {
          indices = GPUTensor<Vector>.empty(<int>[numTokens]);
          weights = GPUTensor<Matrix>.empty(<int>[vocabSize, embedDim]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          // We MUST fill indices with a valid vocab index (e.g., 0) so we don't segfault
          fillTensorGPU(indices!, 0.0, t);
          fillTensorGPU(weights!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outEmbed = embeddingLookupGPU(indices!, weights!, tape);
          return tape;
        },
        freeAll: () {
          indices!.free();
          weights!.free();
          outEmbed!.free();
        },
      );
    }
    {
      int vocabSize = 32000;
      int embedDim = 768;
      int batchSize = 1024;
      int seqLength = 1024;
      int embedBytes = (batchSize * seqLength * 4) +
          (batchSize * seqLength * embedDim * 4) +
          (batchSize * seqLength * embedDim * 4);
      GPUTensor<Matrix>? batchIndices;
      GPUTensor<Matrix>? weights;
      GPUTensor<Tensor3D>? outBatchEmbed;
      runBenchmarkIsolated(
        name: "EMBED_MAT",
        iterations: iterations,
        bytesPerOp: embedBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          batchIndices = GPUTensor<Matrix>.empty(<int>[batchSize, seqLength]);
          weights = GPUTensor<Matrix>.empty(<int>[vocabSize, embedDim]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(batchIndices!, 0.0, t); // Safe valid index
          fillTensorGPU(weights!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outBatchEmbed =
              embeddingLookupBatchGPU(batchIndices!, weights!, tape);
          return tape;
        },
        freeAll: () {
          batchIndices!.free();
          weights!.free();
          outBatchEmbed!.free();
        },
      );
    }

    // --- TENSOR MANIPULATION & ROUTING (Memory Bound) ---
    {
      int sliceRows = M;
      int sliceCols = M ~/ 2; // Slice half the matrix
      int sliceBytes = (sliceRows * sliceCols * 4) * 2; // Read slice, Write slice
      benchSquareMatrix(
        "SLICE_COL",
        1.0,
        (m, tape) => sliceColumnGPU(m, 0, sliceCols, tape),
        bytesPerOp: sliceBytes,
        flopsPerOp: 0,
      );
    }
    {
      // Make a very wide matrix so the row extraction moves enough bytes to measure properly
      int wideCols = 1048576; // 1 Million elements per row
      int rowBytes = wideCols * 4 * 2; // Read row, Write vector
      GPUTensor<Matrix>? matA;
      GPUTensor<Vector>? outRow;
      runBenchmarkIsolated(
        name: "SLICE_ROW",
        iterations: iterations,
        bytesPerOp: rowBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[128, wideCols]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outRow = selectRowGPU(matA!, 64, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          outRow!.free();
        },
      );
    }
    {
      int depth = 64;
      int size = 1024;
      int slice3DBytes = (size * size * 4) * 2;
      GPUTensor<Tensor3D>? t3D;
      GPUTensor<Matrix>? outMat;
      runBenchmarkIsolated(
        name: "SLICE_3D",
        iterations: iterations,
        bytesPerOp: slice3DBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          t3D = GPUTensor<Tensor3D>.empty(<int>[depth, size, size]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(t3D!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outMat = selectMatrixFrom3DGPU(t3D!, 32, tape);
          return tape;
        },
        freeAll: () {
          t3D!.free();
          outMat!.free();
        },
      );
    }
    {
      int halfN = N ~/ 2;
      int concatBytes = N * 4 * 2; // Read N total, Write N total
      GPUTensor<Vector>? vecA;
      GPUTensor<Vector>? vecB;
      GPUTensor<Vector>? outConcat;
      runBenchmarkIsolated(
        name: "CONCAT_VEC",
        iterations: iterations,
        bytesPerOp: concatBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          vecA = GPUTensor<Vector>.empty(<int>[halfN]);
          vecB = GPUTensor<Vector>.empty(<int>[halfN]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(vecA!, 1.0, t);
          fillTensorGPU(vecB!, 2.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outConcat = concatenateGPU(vecA!, vecB!, tape);
          return tape;
        },
        freeAll: () {
          vecA!.free();
          vecB!.free();
          outConcat!.free();
        },
      );
    }
    {
      int numMatrices = 16;
      int rows = 1024;
      int cols = 1024;
      int stackBytes = (numMatrices * rows * cols * 4) * 2;
      List<GPUTensor<Matrix>>? matrices;
      GPUTensor<Tensor3D>? outStack;
      runBenchmarkIsolated(
        name: "STACK_MAT",
        iterations: iterations,
        bytesPerOp: stackBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          matrices = List.generate(
              numMatrices, (_) => GPUTensor<Matrix>.empty(<int>[rows, cols]));
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          for (var m in matrices!) {
            fillTensorGPU(m, 1.0, t);
          }
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outStack = stackMatricesGPU(matrices!, tape);
          return tape;
        },
        freeAll: () {
          for (var m in matrices!) {
            m.free();
          }
          outStack!.free();
        },
      );
    }
    {
      // Simulating Multi-Head Attention Head Scattering
      int seqLen = 4096;
      int dHead = 64;
      int numHeads = 12;
      int dModel = numHeads * dHead;
      int scatterBytes = (seqLen * dModel * 4) * 2; // Gather from 12 heads, write to 1 model tensor
      List<GPUTensor<Matrix>>? heads;
      GPUTensor<Matrix>? outScatter;
      runBenchmarkIsolated(
        name: "SCAT_HEADS",
        iterations: iterations,
        bytesPerOp: scatterBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          heads = List.generate(
              numHeads, (_) => GPUTensor<Matrix>.empty(<int>[seqLen, dHead]));
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          for (var h in heads!) {
            fillTensorGPU(h, 1.0, t);
          }
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outScatter = scatterHeadsGPU(heads!, dModel, tape);
          return tape;
        },
        freeAll: () {
          for (var h in heads!) {
            h.free();
          }
          outScatter!.free();
        },
      );
    }
    {
      int inSize = 4000;
      int pad = 48; // Total out size = 4096 x 4096
      int outSize = inSize + (2 * pad);
      // Read the inner matrix, write the full padded output.
      int padBytes = (inSize * inSize * 4) + (outSize * outSize * 4);
      GPUTensor<Matrix>? matA;
      GPUTensor<Matrix>? outPad;
      runBenchmarkIsolated(
        name: "PAD_2D",
        iterations: iterations,
        bytesPerOp: padBytes,
        flopsPerOp: 0,
        allocateInputs: () {
          matA = GPUTensor<Matrix>.empty(<int>[inSize, inSize]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(matA!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outPad = padMatrixGPU(matA!, pad, tape);
          return tape;
        },
        freeAll: () {
          matA!.free();
          outPad!.free();
        },
      );
    }

    // --- NORMALIZATION & REGULARIZATION ---
    {
      int dModel = M; // 8192
      // Read input, read four parameter vectors (gamma, beta, mean, rstd),
      // write output.
      int normBytes = (M * M * 4) + (dModel * 4 * 4) + (M * M * 4);
      int normFlops = M * M * 8; // Mean, Var, Normalize, Scale, Shift
      GPUTensor<Matrix>? input;
      GPUTensor<Vector>? gamma;
      GPUTensor<Vector>? beta;
      GPUTensor<Vector>? mean;
      GPUTensor<Vector>? rstd;
      GPUTensor<Matrix>? outNorm;
      runBenchmarkIsolated(
        name: "LAYER_NORM",
        iterations: iterations,
        bytesPerOp: normBytes,
        flopsPerOp: normFlops,
        allocateInputs: () {
          input = GPUTensor<Matrix>.empty(<int>[M, M]);
          gamma = GPUTensor<Vector>.empty(<int>[dModel]);
          beta = GPUTensor<Vector>.empty(<int>[dModel]);
          mean = GPUTensor<Vector>.empty(<int>[dModel]);
          rstd = GPUTensor<Vector>.empty(<int>[dModel]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(input!, 1.0, t);
          fillTensorGPU(gamma!, 1.0, t);
          fillTensorGPU(beta!, 0.0, t);
          fillTensorGPU(mean!, 0.0, t);
          fillTensorGPU(rstd!, 1.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outNorm = layerNormMatrixGPU(
              input!, gamma!, beta!, mean!, rstd!, 1e-5, tape);
          return tape;
        },
        freeAll: () {
          input!.free();
          gamma!.free();
          beta!.free();
          mean!.free();
          rstd!.free();
          outNorm!.free();
        },
      );
    }
    benchSquareMatrix(
      "DROPOUT",
      1.0,
      (m, tape) => dropoutGPU<Matrix>(m, 0.5, tape),
      // Read Input, Write Mask, Write Output
      bytesPerOp: (M * M * 4) + (M * M * 4) + (M * M * 4),
      flopsPerOp: M * M, // Random RNG comparison per element
    );

    // --- ADVANCED METRICS & LOSSES (Compositional) ---
    // Multiply (Read 2N, Write 1N), Sum (Read 1N, Write 1)
    benchVectorPairToScalar(
      "DOT_PROD",
      2.0,
      3.0,
      (a, b, tape) => dotProductGPU(a, b, tape),
      bytesPerElement: 16,
      flopsPerElement: 2,
    );
    {
      int normBytes = N * 12; // Pow (Read 1N, Write 1N), Sum (Read 1N, Write 1)
      int normFlops = N * 2;
      GPUTensor<Vector>? vecA;
      GPUTensor<Scalar>? outNorm;
      runBenchmarkIsolated(
        name: "L2_NORM",
        iterations: iterations,
        bytesPerOp: normBytes,
        flopsPerOp: normFlops,
        allocateInputs: () {
          vecA = GPUTensor<Vector>.empty(<int>[N]);
        },
        initData: () {
          CommandBuffer t = CommandBuffer();
          fillTensorGPU(vecA!, 2.0, t);
          CudaEngine.run(t.bytes());
        },
        buildTape: () {
          CommandBuffer tape = CommandBuffer();
          outNorm = l2NormGPU(vecA!, tape);
          return tape;
        },
        freeAll: () {
          vecA!.free();
          outNorm!.free();
        },
      );
    }
    // Sub (3N), Pow (2N), Sum (1N)
    benchVectorPairToScalar(
      "EUC_DIST",
      5.0,
      2.0,
      (a, b, tape) => euclideanDistanceGPU(a, b, tape),
      bytesPerElement: 24,
      flopsPerElement: 3,
    );
    // Dot (4N), NormA (3N), NormB (3N) -> Total 10N reads/writes
    benchVectorPairToScalar(
      "COS_SIM",
      1.0,
      1.0,
      (a, b, tape) => cosineSimilarityGPU(a, b, tape),
      bytesPerElement: 40,
      flopsPerElement: 6,
    );
    // Sub (3N), Abs (2N), Sum (1N)
    benchVectorPairToScalar(
      "MAE_LOSS",
      5.0,
      3.0,
      (a, b, tape) => maeLossGPU(a, b, tape),
      bytesPerElement: 24,
      flopsPerElement: 3,
    );
  } finally {
    // Always release the engine, even if a benchmark above threw.
    CudaEngine.dispose();
  }
}