runBenchmarkIsolated function
void
runBenchmarkIsolated({
- required String name,
- required int iterations,
- required int bytesPerOp,
- required int flopsPerOp,
- required void allocateInputs(),
- required void initData(),
- required CommandBuffer buildTape(),
- required void freeAll(),
})
Implementation
/// Runs one benchmark end to end and prints a single `[BENCHMARK]` summary line.
///
/// The phases run in this order:
///
/// 1. [allocateInputs] is called and timed.
/// 2. [initData] is called; it is not timed.
/// 3. [buildTape] is called and timed; its duration is added to the same
///    stopwatch as step 1 and reported together as "Alloc/Tape".
/// 4. The tape's bytes are passed to `CudaEngine.run` three times untimed
///    (warmup), then [iterations] times under a stopwatch.
/// 5. [freeAll] is called and timed.
///
/// [freeAll] is invoked from a `finally` block, so it still runs when
/// fetching the tape bytes or any `CudaEngine.run` call throws. It is not
/// invoked when [allocateInputs], [initData] or [buildTape] throws, because
/// the state of the allocations is unknown at that point.
///
/// [bytesPerOp] and [flopsPerOp] are the per-iteration byte and
/// floating-point-operation counts used to derive the printed GB/s and
/// TFLOPs figures. When [iterations] is zero, or the measured time is zero
/// microseconds, the derived figures are `NaN`/`Infinity` and are printed as
/// such.
void runBenchmarkIsolated({
  required String name,
  required int iterations,
  required int bytesPerOp,
  required int flopsPerOp,
  required void Function() allocateInputs,
  required void Function() initData,
  required CommandBuffer Function() buildTape,
  required void Function() freeAll,
}) {
  // Number of untimed runs issued before measurement starts.
  const warmupRuns = 3;

  // 1. Time allocation of inputs.
  final allocSw = Stopwatch()..start();
  allocateInputs();
  allocSw.stop();

  // 2. Initialise data (deliberately excluded from the overhead figure).
  initData();

  // 3. Time tape generation; accumulates onto the same stopwatch as step 1.
  allocSw.start();
  final CommandBuffer tape = buildTape();
  allocSw.stop();

  final execSw = Stopwatch();
  final freeSw = Stopwatch();
  try {
    final Uint8List tapeBytes = tape.bytes();

    // Warmup runs, not measured.
    for (var i = 0; i < warmupRuns; i++) {
      CudaEngine.run(tapeBytes);
    }

    // 4. Time execution.
    execSw.start();
    for (var i = 0; i < iterations; i++) {
      CudaEngine.run(tapeBytes);
    }
    execSw.stop();
  } finally {
    // 5. Time deallocation. Runs on failure too, so a throwing run does not
    // leave the benchmark's allocations behind.
    freeSw.start();
    freeAll();
    freeSw.stop();
  }

  // Derived figures. The per-op counts are promoted to double before being
  // multiplied by the iteration count so that large products cannot wrap
  // around the 64-bit integer range.
  final seconds = execSw.elapsedMicroseconds / 1000000.0;
  final avgMs = (seconds * 1000.0) / iterations;
  final gbPerSec =
      (bytesPerOp.toDouble() * iterations / 1000000000.0) / seconds;
  final tflops =
      (flopsPerOp.toDouble() * iterations / 1000000000000.0) / seconds;

  final timeStr = avgMs.toStringAsFixed(2).padLeft(6);
  final gbStr = gbPerSec.toStringAsFixed(2).padLeft(8);
  final tflopsStr = tflops.toStringAsFixed(4).padLeft(8);
  final allocStr =
      (allocSw.elapsedMicroseconds / 1000.0).toStringAsFixed(2).padLeft(6);
  final freeStr =
      (freeSw.elapsedMicroseconds / 1000.0).toStringAsFixed(2).padLeft(6);

  print("[BENCHMARK] ${name.padRight(10)} | Time: $timeStr ms | Bandwidth: $gbStr GB/s | Compute: $tflopsStr TFLOPs Overhead | Alloc/Tape: $allocStr ms | Free: $freeStr ms");
}