runBenchmarkIsolated function

void runBenchmarkIsolated({
  required String name,
  required int iterations,
  required int bytesPerOp,
  required int flopsPerOp,
  required void allocateInputs(),
  required void initData(),
  required CommandBuffer buildTape(),
  required void freeAll(),
})

Implementation

/// Runs a single benchmark case and prints one `[BENCHMARK]` summary line.
///
/// The stages run in this order:
///
/// 1. [allocateInputs] is called and timed.
/// 2. [initData] is called; it is not timed.
/// 3. [buildTape] is called and timed on the same stopwatch as step 1, so
///    the reported "Alloc/Tape" figure is the sum of both.
/// 4. The tape's bytes are passed to `CudaEngine.run` three times untimed
///    (warm-up) and then [iterations] times under the execution stopwatch.
/// 5. [freeAll] is called and timed.
///
/// [name] labels the output line. [bytesPerOp] and [flopsPerOp] are the
/// per-iteration byte and floating-point-operation counts used to derive the
/// printed GB/s and TFLOPs figures from the measured execution time.
///
/// Once [allocateInputs] has returned, [freeAll] is always invoked, even if a
/// later stage throws; in that case the error propagates and nothing is
/// printed. If [freeAll] itself throws while another error is in flight, the
/// error from [freeAll] is the one the caller sees.
///
/// If [iterations] is not positive, or the measured time is zero, the derived
/// figures are printed as `NaN`/`Infinity` rather than raising.
void runBenchmarkIsolated({
  required String name,
  required int iterations,
  required int bytesPerOp,
  required int flopsPerOp,
  required void Function() allocateInputs,
  required void Function() initData,
  required CommandBuffer Function() buildTape,
  required void Function() freeAll,
}) {
  final allocSw = Stopwatch();
  final execSw = Stopwatch();
  final freeSw = Stopwatch();

  // 1. Time allocation of inputs.
  allocSw.start();
  allocateInputs();
  allocSw.stop();

  try {
    // 2. Initialize data (excluded from the reported overhead).
    initData();

    // 3. Time tape generation; accumulates onto the input-allocation time.
    allocSw.start();
    final tape = buildTape();
    allocSw.stop();

    final Uint8List tapeBytes = tape.bytes();

    // Warm-up runs, not included in any measurement.
    for (var i = 0; i < 3; i++) {
      CudaEngine.run(tapeBytes);
    }

    // 4. Time execution.
    execSw.start();
    for (var i = 0; i < iterations; i++) {
      CudaEngine.run(tapeBytes);
    }
    execSw.stop();
  } finally {
    // 5. Time deallocation. Done in `finally` so that a failure in any stage
    // after input allocation does not skip the cleanup callback.
    freeSw.start();
    freeAll();
    freeSw.stop();
  }

  // Derived figures. The per-op counts are widened to double before being
  // multiplied by the iteration count so that a very large product cannot
  // wrap around as a 64-bit integer.
  final seconds = execSw.elapsedMicroseconds / 1000000.0;
  final avgMs = (seconds * 1000.0) / iterations;
  final gbPerSec =
      (bytesPerOp.toDouble() * iterations / 1000000000.0) / seconds;
  final tflops =
      (flopsPerOp.toDouble() * iterations / 1000000000000.0) / seconds;

  final timeStr = avgMs.toStringAsFixed(2).padLeft(6);
  final gbStr = gbPerSec.toStringAsFixed(2).padLeft(8);
  final tflopsStr = tflops.toStringAsFixed(4).padLeft(8);
  final allocStr =
      (allocSw.elapsedMicroseconds / 1000.0).toStringAsFixed(2).padLeft(6);
  final freeStr =
      (freeSw.elapsedMicroseconds / 1000.0).toStringAsFixed(2).padLeft(6);

  // NOTE(review): the "Overhead  |" placement looks like a misplaced column
  // separator; left untouched because the line may be parsed elsewhere.
  print("[BENCHMARK] ${name.padRight(10)} | Time: $timeStr ms | Bandwidth: $gbStr GB/s | Compute: $tflopsStr TFLOPs Overhead  | Alloc/Tape: $allocStr ms | Free: $freeStr ms");
}