benchmarkScopedExactScan static method
Measure the scoped exact-scan branch of searchMetaHybrid.
Seeds one scoped source with scopedChunkCount chunks (each ~256 ASCII
bytes) plus a distractor source so the post-filter path has work to do,
then runs searchMetaHybrid with sourceIds=[scopedSourceId] across the
supplied bm25Weights. Each variant captures query_metrics
scoped_exact_scan_* counters before disposing the handle, so the caller
can verify that scoped BM25 ranks are served from the active term index
without query-time chunk-body reads or tokenization.
Intentionally meta-only — no hydration, no assemble — so the recorded
scoped-scan bytes are not blurred with full_hydrate_* materialization.
Implementation
/// Measures the scoped exact-scan branch of `searchMetaHybrid`.
///
/// Seeds one "scoped" source with [scopedChunkCount] chunks and one
/// "distractor" source with [distractorChunkCount] chunks into a throwaway
/// database, rebuilds the collection's HNSW and BM25 indexes, and then runs
/// one meta-only hybrid search per entry of [bm25Weights], restricted to the
/// scoped source. For each search the vector weight is `1.0 - bm25Weight`.
///
/// Parameters:
/// * [scopedChunkCount] / [distractorChunkCount] - number of chunks seeded
///   into the scoped and distractor sources.
/// * [topK] - `topK` passed to every search.
/// * [chunkContentChars] - length, in characters, of every generated chunk
///   body. The generated text is made of ASCII words, and the value is
///   reported as `chunkContentBytes` in the result.
/// * [bm25Weights] - BM25 weights to benchmark; one variant is recorded per
///   weight, in order.
/// * [restoreDbPath] - when non-null, the DB pool is re-initialised on this
///   path once the benchmark pool has been closed.
/// * [dbPathOverride] - path of the benchmark database. Defaults to
///   `scoped_exact_scan_bench.sqlite` in the application documents directory.
///
/// Any existing file at the benchmark path (and its `.tokenizer.json`
/// sibling) is deleted before the run and again afterwards.
///
/// Returns a [ScopedExactScanBenchResult] holding the inputs and one
/// [QueryPayloadVariantStats] per BM25 weight. Errors from seeding, indexing
/// or searching propagate to the caller after cleanup has been attempted.
static Future<ScopedExactScanBenchResult> benchmarkScopedExactScan({
  int scopedChunkCount = 50,
  int distractorChunkCount = 10,
  int topK = 4,
  int chunkContentChars = 256,
  List<double> bm25Weights = const [0.0, 0.5],
  String? restoreDbPath,
  String? dbPathOverride,
}) async {
  const collectionId = 'bench-scoped-exact-scan';
  const queryText = 'install checksum smoke';
  final queryEmbedding = Float32List.fromList([1.0, 0.0, 0.0, 0.0]);
  final benchDbPath =
      dbPathOverride ??
      "${(await getApplicationDocumentsDirectory()).path}/scoped_exact_scan_bench.sqlite";
  final benchFile = File(benchDbPath);
  final tokenizerFile = File('$benchDbPath.tokenizer.json');

  // Removes [file] when present; used both to start from a clean slate and
  // to clean up after the run.
  Future<void> deleteIfExists(File file) async {
    if (await file.exists()) {
      await file.delete();
    }
  }

  // Builds a deterministic body of exactly `chunkContentChars` characters by
  // cycling through a fixed word list, starting at offset `seed`.
  String makeChunkContent(int seed) {
    const words = [
      'install',
      'checksum',
      'smoke',
      'setup',
      'verify',
      'archive',
      'manifest',
      'release',
      'config',
      'package',
    ];
    final buffer = StringBuffer();
    var i = seed;
    while (buffer.length < chunkContentChars) {
      if (buffer.isNotEmpty) buffer.write(' ');
      buffer.write(words[i % words.length]);
      i++;
    }
    // The last word may overshoot the target length; trim to the exact size.
    return buffer.toString().substring(0, chunkContentChars);
  }

  // Creates a source named `name`, marks it completed, and attaches
  // `chunkCount` generated chunks with 4-dimensional embeddings.
  // Returns the id of the new source.
  Future<int> seedSource({
    required String name,
    required int chunkCount,
    required int seedBase,
  }) async {
    final source = await source_rag.addSourceInCollection(
      collectionId: collectionId,
      content: 'bench source $name',
      metadata: '{"source":"$name"}',
      name: name,
    );
    await source_rag.updateSourceStatus(
      sourceId: source.sourceId,
      status: 'completed',
    );
    var cursor = 0;
    final chunkData = <source_rag.ChunkData>[];
    for (var i = 0; i < chunkCount; i++) {
      final content = makeChunkContent(seedBase + i);
      chunkData.add(
        source_rag.ChunkData(
          content: content,
          chunkIndex: i,
          startPos: cursor,
          endPos: cursor + content.length,
          chunkType: 'general',
          // Embeddings cycle through 16 variations close to the query
          // vector [1, 0, 0, 0].
          embedding: Float32List.fromList([
            1.0 - (i % 16) * 0.01,
            (i % 16) * 0.01,
            0.0,
            0.0,
          ]),
        ),
      );
      // Chunks are laid out back to back with a one-character gap.
      cursor += content.length + 1;
    }
    await source_rag.addChunks(sourceId: source.sourceId, chunks: chunkData);
    return source.sourceId;
  }

  await deleteIfExists(benchFile);
  await deleteIfExists(tokenizerFile);
  await initDbPool(dbPath: benchDbPath, maxSize: 4);

  // From here on the pool points at the benchmark database, so every step is
  // guarded: whatever fails, the pool is closed, the temporary files are
  // removed and the caller's database is restored.
  try {
    await initDb();
    await source_rag.initSourceDb();
    // Minimal whitespace/word-level tokenizer definition covering the query
    // terms; written next to the benchmark database.
    await tokenizerFile.writeAsString(
      '{"version":"1.0","truncation":null,"padding":null,'
      '"added_tokens":[],"normalizer":null,'
      '"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,'
      '"decoder":null,"model":{"type":"WordLevel",'
      '"vocab":{"[UNK]":0,"install":1,"checksum":2,"smoke":3,"setup":4},'
      '"unk_token":"[UNK]"}}',
      flush: true,
    );
    await initTokenizer(tokenizerPath: tokenizerFile.path);

    final scopedSourceId = await seedSource(
      name: 'scoped',
      chunkCount: scopedChunkCount,
      seedBase: 0,
    );
    await seedSource(
      name: 'distractor',
      chunkCount: distractorChunkCount,
      seedBase: 7,
    );
    await source_rag.rebuildChunkHnswIndexForCollection(
      collectionId: collectionId,
    );
    await source_rag.rebuildChunkBm25IndexForCollection(
      collectionId: collectionId,
    );

    final variants = <QueryPayloadVariantStats>[];
    for (final bm25Weight in bm25Weights) {
      source_rag.SearchHandle? handle;
      // Start every variant from zeroed counters so its stats are isolated.
      query_metrics.resetQueryContentReadStats();
      final sw = Stopwatch()..start();
      try {
        handle = await source_rag.searchMetaHybrid(
          collectionId: collectionId,
          queryText: queryText,
          queryEmbedding: queryEmbedding,
          options: source_rag.SearchMetaHybridOptions(
            topK: topK,
            vectorWeight: 1.0 - bm25Weight,
            bm25Weight: bm25Weight,
            sourceIds: _toInt64List([scopedSourceId]),
            adjacentChunks: 0,
          ),
        );
        final hits = await handle.hitMeta();
        sw.stop();
        // Counters are captured before the handle is disposed.
        final nativeReadStats = query_metrics.takeQueryContentReadStats();
        variants.add(
          _variantStatsFromNative(
            label:
                'scoped exact scan (chunks=$scopedChunkCount, bm25=${bm25Weight.toStringAsFixed(2)})',
            elapsedMs: sw.elapsedMicroseconds / 1000.0,
            hitCount: hits.length,
            nativeReadStats: nativeReadStats,
          ),
        );
      } finally {
        // No-op on the success path; stops the clock when the search threw.
        sw.stop();
        if (handle != null) {
          await handle.dispose();
        }
      }
    }
    return ScopedExactScanBenchResult(
      scopedChunkCount: scopedChunkCount,
      distractorChunkCount: distractorChunkCount,
      chunkContentBytes: chunkContentChars,
      topK: topK,
      bm25Weights: List.unmodifiable(bm25Weights),
      variants: List.unmodifiable(variants),
    );
  } finally {
    query_metrics.resetQueryContentReadStats();
    // Cleanup steps are chained with nested `finally` blocks so that a
    // failure in one step does not skip the ones after it.
    try {
      await closeDbPool();
    } finally {
      try {
        await deleteIfExists(benchFile);
        await deleteIfExists(tokenizerFile);
      } finally {
        if (restoreDbPath != null) {
          await initDbPool(dbPath: restoreDbPath, maxSize: 4);
        }
      }
    }
  }
}