loadEvalSuiteFromDir function
Loads an EvalSuite from a directory laid out as:
suites/<suite_name>/
suite.json ← suite metadata (incl. agent_name)
tasks/ ← one JSON file per task; sub-folders are allowed for organization
positive/
my_task.json
negative/
my_other_task.json
suite.json schema:
{
"name": "card_agent_capability",
"agent_name": "card_agent",
"kind": "capability",
"requireReferenceSolution": true,
"taskPassThreshold": 1.0
}
Each <task>.json follows the schema documented on JsonEvalTask.
Note that agent_name belongs on the suite, not on individual tasks
— every task in a suite targets the same agent.
Why a directory layout? Big task sets quickly outgrow a single JSON blob. One file per task lets product owners review/PR a task at a time, lets git diffs be readable, and lets sub-folders group tasks by failure bucket / source / difficulty.
Implementation
/// Loads an [EvalSuite] from the suite directory [root].
///
/// Reads `suite.json` directly under [root] for the suite metadata, then
/// decodes every `*.json` file found recursively under `tasks/` with
/// `_decodeTask`, passing [graderRegistry] through. Tasks are ordered by
/// full file path so a given task keeps the same position across runs.
///
/// Metadata defaults: `kind` falls back to `mixed`,
/// `requireReferenceSolution` to `false`, and `taskPassThreshold` to `1.0`.
///
/// Throws an [ArgumentError] if [root] does not exist, a [FormatException]
/// if `suite.json` is not valid JSON, and a [StateError] if `suite.json` or
/// `tasks/` is missing, a metadata field is missing or has the wrong JSON
/// type, `kind` is not a known [SuiteKind] name, or no task files exist.
EvalSuite loadEvalSuiteFromDir(
  Directory root, {
  required GraderRegistry graderRegistry,
}) {
  if (!root.existsSync()) {
    throw ArgumentError('Suite directory does not exist: ${root.path}');
  }

  // 1. Suite metadata.
  final suiteFile = File('${root.path}/suite.json');
  if (!suiteFile.existsSync()) {
    throw StateError(
      'Missing suite.json at ${suiteFile.path}. '
      'Every suite directory must declare its name + kind.',
    );
  }

  // Re-throw parse failures with the file path attached; the decoder's own
  // message does not say which file was being read.
  Object? decodeSuiteJson() {
    try {
      return jsonDecode(suiteFile.readAsStringSync());
    } on FormatException catch (e) {
      throw FormatException(
        '${suiteFile.path}: invalid JSON: ${e.message}',
        e.source,
        e.offset,
      );
    }
  }

  final decoded = decodeSuiteJson();
  if (decoded is! Map<String, dynamic>) {
    throw StateError(
      '${suiteFile.path}: top-level JSON value must be an object.',
    );
  }
  final suiteJson = decoded;

  // Reads an optional field, returning null when it is absent or JSON null.
  // A value of the wrong JSON type is reported against the file instead of
  // surfacing as a bare cast failure.
  T? field<T extends Object>(String key, String expected) {
    final value = suiteJson[key];
    if (value == null || value is T) return value as T?;
    throw StateError('${suiteFile.path}: "$key" must be $expected.');
  }

  final name = field<String>('name', 'a string')?.trim();
  if (name == null || name.isEmpty) {
    throw StateError('${suiteFile.path}: "name" is required and non-empty.');
  }

  final agentName = field<String>('agent_name', 'a string')?.trim();
  if (agentName == null || agentName.isEmpty) {
    throw StateError(
      '${suiteFile.path}: "agent_name" is required and non-empty.',
    );
  }

  final kindStr = field<String>('kind', 'a string')?.trim() ?? 'mixed';
  final kind = SuiteKind.values.firstWhere(
    (k) => k.name == kindStr,
    orElse: () => throw StateError(
      '${suiteFile.path}: invalid kind "$kindStr". '
      'Use one of ${SuiteKind.values.map((k) => k.name).toList()}',
    ),
  );

  final requireReferenceSolution =
      field<bool>('requireReferenceSolution', 'a boolean') ?? false;
  final taskPassThreshold =
      field<num>('taskPassThreshold', 'a number')?.toDouble() ?? 1.0;

  // 2. Task files: every *.json (case-insensitive) anywhere under tasks/.
  final tasksDir = Directory('${root.path}/tasks');
  if (!tasksDir.existsSync()) {
    throw StateError(
      'Missing tasks/ directory at ${tasksDir.path}. '
      'Place one task JSON file per test under tasks/.',
    );
  }

  final taskFiles = tasksDir
      .listSync(recursive: true)
      .whereType<File>()
      .where((f) => f.path.toLowerCase().endsWith('.json'))
      .toList();
  if (taskFiles.isEmpty) {
    throw StateError(
      'No task files found under ${tasksDir.path}. '
      'Add at least one task JSON before loading the suite.',
    );
  }

  // Stable order: alphabetical by full path. Same task, same position
  // across runs — important for diff/saturation reports.
  taskFiles.sort((a, b) => a.path.compareTo(b.path));

  return EvalSuite(
    name: name,
    agentName: agentName,
    kind: kind,
    tasks: [
      for (final f in taskFiles)
        _decodeTask(f, rootDir: root, graderRegistry: graderRegistry),
    ],
    requireReferenceSolution: requireReferenceSolution,
    taskPassThreshold: taskPassThreshold,
  );
}