loadEvalSuiteFromDir function

EvalSuite loadEvalSuiteFromDir(

Directory root, {
required GraderRegistry graderRegistry,

})

Loads an EvalSuite from a directory laid out as:

suites/<suite_name>/
  suite.json                 ← suite metadata (incl. agent_name)
  tasks/                     ← one JSON file per task; sub-folders
    positive/                  are allowed for organization
      my_task.json
    negative/
      my_other_task.json

suite.json schema:

{
  "name": "card_agent_capability",
  "agent_name": "card_agent",
  "kind": "capability",
  "requireReferenceSolution": true,
  "taskPassThreshold": 1.0
}

Each <task>.json follows the schema documented on JsonEvalTask. Note that agent_name belongs on the suite, not on individual tasks — every task in a suite targets the same agent.

Why a directory layout? Big task sets quickly outgrow a single JSON blob. One file per task lets product owners review (and open a PR for) one task at a time, keeps git diffs readable, and lets sub-folders group tasks by failure bucket, source, or difficulty.

Implementation

/// Loads an [EvalSuite] from the directory [root].
///
/// Expected layout:
///
/// ```text
/// <root>/
///   suite.json      suite metadata
///   tasks/          one JSON file per task; sub-folders are allowed
/// ```
///
/// Keys read from `suite.json`:
///
/// * `name` — string, required, non-empty after trimming.
/// * `agent_name` — string, required, non-empty after trimming.
/// * `kind` — string, optional, defaults to `mixed`; must equal the name of
///   a [SuiteKind] value.
/// * `requireReferenceSolution` — bool, optional, defaults to `false`.
/// * `taskPassThreshold` — number, optional, defaults to `1.0`.
///
/// Every file under `tasks/` (searched recursively) whose path ends in
/// `.json`, case-insensitively, is decoded into a task. [graderRegistry] is
/// forwarded unchanged to the task decoder. Tasks are ordered by their full
/// path string so the same task lands at the same position on every run.
///
/// Throws an [ArgumentError] if [root] does not exist, a [FormatException]
/// if `suite.json` is not valid JSON, and a [StateError] if `suite.json` or
/// `tasks/` is missing, if `suite.json` is not a JSON object, if a key has
/// the wrong type or an invalid value, or if no task files are found.
EvalSuite loadEvalSuiteFromDir(
  Directory root, {
  required GraderRegistry graderRegistry,
}) {
  if (!root.existsSync()) {
    throw ArgumentError('Suite directory does not exist: ${root.path}');
  }

  // 1. Suite metadata.
  final suiteFile = File('${root.path}/suite.json');
  if (!suiteFile.existsSync()) {
    throw StateError(
      'Missing suite.json at ${suiteFile.path}. '
      'Every suite directory must declare its name + kind.',
    );
  }

  Object? decoded;
  try {
    decoded = jsonDecode(suiteFile.readAsStringSync());
  } on FormatException catch (e) {
    // Same exception type as before, but now it names the offending file.
    throw FormatException(
      '${suiteFile.path}: invalid JSON: ${e.message}',
      e.source,
      e.offset,
    );
  }
  if (decoded is! Map<String, dynamic>) {
    // Previously an unchecked cast failed here with a TypeError that did not
    // mention the file.
    throw StateError(
      '${suiteFile.path}: expected a JSON object at the top level, '
      'got ${decoded.runtimeType}.',
    );
  }
  final suiteJson = decoded;

  // Reads an optional key, reporting a type mismatch as a StateError that
  // names the file, consistent with the other validation errors below.
  T? readOptional<T extends Object>(String key, String expected) {
    final value = suiteJson[key];
    if (value == null) return null;
    if (value is T) return value;
    throw StateError(
      '${suiteFile.path}: "$key" must be $expected, '
      'got ${value.runtimeType}.',
    );
  }

  final name = readOptional<String>('name', 'a string')?.trim();
  if (name == null || name.isEmpty) {
    throw StateError('${suiteFile.path}: "name" is required and non-empty.');
  }
  final agentName = readOptional<String>('agent_name', 'a string')?.trim();
  if (agentName == null || agentName.isEmpty) {
    throw StateError(
      '${suiteFile.path}: "agent_name" is required and non-empty.',
    );
  }
  final kindStr = readOptional<String>('kind', 'a string')?.trim() ?? 'mixed';
  final kind = SuiteKind.values.firstWhere(
    (k) => k.name == kindStr,
    orElse: () => throw StateError(
      '${suiteFile.path}: invalid kind "$kindStr". '
      'Use one of ${SuiteKind.values.map((k) => k.name).toList()}',
    ),
  );
  final requireReferenceSolution =
      readOptional<bool>('requireReferenceSolution', 'a boolean') ?? false;
  final taskPassThreshold =
      readOptional<num>('taskPassThreshold', 'a number')?.toDouble() ?? 1.0;

  // 2. Task files, collected recursively from tasks/.
  final tasksDir = Directory('${root.path}/tasks');
  if (!tasksDir.existsSync()) {
    throw StateError(
      'Missing tasks/ directory at ${tasksDir.path}. '
      'Place one task JSON file per test under tasks/.',
    );
  }
  final taskFiles = tasksDir
      .listSync(recursive: true)
      .whereType<File>()
      .where((f) => f.path.toLowerCase().endsWith('.json'))
      .toList();
  if (taskFiles.isEmpty) {
    throw StateError(
      'No task files found under ${tasksDir.path}. '
      'Add at least one task JSON before loading the suite.',
    );
  }
  // Stable order: sorted by full path string. Same task, same position
  // across runs — important for diff/saturation reports.
  taskFiles.sort((a, b) => a.path.compareTo(b.path));

  final tasks = <EvalTask>[
    for (final f in taskFiles)
      _decodeTask(f, rootDir: root, graderRegistry: graderRegistry),
  ];

  return EvalSuite(
    name: name,
    agentName: agentName,
    kind: kind,
    tasks: tasks,
    requireReferenceSolution: requireReferenceSolution,
    taskPassThreshold: taskPassThreshold,
  );
}

loadEvalSuiteFromDir function

Implementation

eval library