benchmarkScopedExactScan static method

Future<ScopedExactScanBenchResult> benchmarkScopedExactScan({
  int scopedChunkCount = 50,
  int distractorChunkCount = 10,
  int topK = 4,
  int chunkContentChars = 256,
  List<double> bm25Weights = const [0.0, 0.5],
  String? restoreDbPath,
  String? dbPathOverride,
})

Measures the scoped exact-scan branch of searchMetaHybrid.

Seeds one scoped source with scopedChunkCount chunks (each exactly chunkContentChars ASCII characters, 256 by default) plus a distractor source with distractorChunkCount chunks so the post-filter path has work to do, then runs searchMetaHybrid with sourceIds=[scopedSourceId] once for each entry in bm25Weights. Each variant captures the query_metrics scoped_exact_scan_* counters before disposing the handle, so the caller can verify that scoped BM25 ranks are served from the active term index without query-time chunk-body reads or tokenization.

Intentionally meta-only — no hydration and no assembly — so the recorded scoped-scan bytes are not blurred with full_hydrate_* materialization.

Implementation

/// Benchmarks the scoped exact-scan path of `searchMetaHybrid`.
///
/// Creates a throwaway database at [dbPathOverride] (or, when that is null,
/// `scoped_exact_scan_bench.sqlite` inside the application documents
/// directory), seeds a `scoped` source with [scopedChunkCount] chunks and a
/// `distractor` source with [distractorChunkCount] chunks, rebuilds the
/// collection's HNSW and BM25 indexes, and then runs one meta-only hybrid
/// search per entry in [bm25Weights], restricted to the scoped source and
/// asking for [topK] hits.
///
/// Every chunk body is exactly [chunkContentChars] ASCII characters. For each
/// BM25 weight `w` the vector weight is `1.0 - w`; the elapsed time, hit
/// count, and the native query read stats taken right after `hitMeta()` are
/// recorded as one variant of the returned [ScopedExactScanBenchResult].
///
/// The run calls `initDbPool` on the bench database. On exit — success or
/// failure — the bench pool is closed, the bench database and its tokenizer
/// file are deleted, and, when [restoreDbPath] is non-null, `initDbPool` is
/// called again on that path.
///
/// Throws a [RangeError] if [chunkContentChars] is negative.
static Future<ScopedExactScanBenchResult> benchmarkScopedExactScan({
  int scopedChunkCount = 50,
  int distractorChunkCount = 10,
  int topK = 4,
  int chunkContentChars = 256,
  List<double> bm25Weights = const [0.0, 0.5],
  String? restoreDbPath,
  String? dbPathOverride,
}) async {
  // Fail fast, before touching any file or the database pool. Without this
  // check a negative size only surfaces as a RangeError from `substring`
  // once seeding has started.
  RangeError.checkNotNegative(chunkContentChars, 'chunkContentChars');

  const collectionId = 'bench-scoped-exact-scan';
  const queryText = 'install checksum smoke';
  final queryEmbedding = Float32List.fromList([1.0, 0.0, 0.0, 0.0]);

  final benchDbPath = dbPathOverride ??
      '${(await getApplicationDocumentsDirectory()).path}/scoped_exact_scan_bench.sqlite';
  final benchFile = File(benchDbPath);
  final tokenizerFile = File('$benchDbPath.tokenizer.json');

  // Removes [file] when present; a missing file is not an error.
  Future<void> deleteIfExists(File file) async {
    if (await file.exists()) {
      await file.delete();
    }
  }

  // Builds a deterministic chunk body of exactly `chunkContentChars`
  // characters by cycling through a fixed word list starting at `seed`.
  String makeChunkContent(int seed) {
    const words = [
      'install',
      'checksum',
      'smoke',
      'setup',
      'verify',
      'archive',
      'manifest',
      'release',
      'config',
      'package',
    ];
    final buffer = StringBuffer();
    var i = seed;
    while (buffer.length < chunkContentChars) {
      if (buffer.isNotEmpty) buffer.write(' ');
      buffer.write(words[i % words.length]);
      i++;
    }
    // The last word usually overshoots the target length; trim to size.
    return buffer.toString().substring(0, chunkContentChars);
  }

  // Adds one source named `name` to the bench collection, marks it
  // 'completed', attaches `chunkCount` generated chunks, and returns its id.
  Future<int> seedSource({
    required String name,
    required int chunkCount,
    required int seedBase,
  }) async {
    final source = await source_rag.addSourceInCollection(
      collectionId: collectionId,
      content: 'bench source $name',
      metadata: '{"source":"$name"}',
      name: name,
    );
    await source_rag.updateSourceStatus(
      sourceId: source.sourceId,
      status: 'completed',
    );
    var cursor = 0;
    final chunkData = <source_rag.ChunkData>[];
    for (var i = 0; i < chunkCount; i++) {
      final content = makeChunkContent(seedBase + i);
      chunkData.add(
        source_rag.ChunkData(
          content: content,
          chunkIndex: i,
          startPos: cursor,
          endPos: cursor + content.length,
          chunkType: 'general',
          // Embeddings repeat every 16 chunks and stay close to the query
          // vector [1, 0, 0, 0].
          embedding: Float32List.fromList([
            1.0 - (i % 16) * 0.01,
            (i % 16) * 0.01,
            0.0,
            0.0,
          ]),
        ),
      );
      // Positions are laid out as if chunks were joined by one separator.
      cursor += content.length + 1;
    }
    await source_rag.addChunks(sourceId: source.sourceId, chunks: chunkData);
    return source.sourceId;
  }

  // Start from a clean slate so leftovers of an earlier run cannot leak in.
  await deleteIfExists(benchFile);
  await deleteIfExists(tokenizerFile);

  // Tracks whether the bench pool was actually opened, so that cleanup only
  // closes/restores the pool when this run really replaced it.
  var benchPoolOpened = false;
  try {
    // All setup lives inside the `try` so that a failure in any init step
    // still closes the bench pool, removes the temp files, and restores the
    // caller's pool.
    await initDbPool(dbPath: benchDbPath, maxSize: 4);
    benchPoolOpened = true;
    await initDb();
    await source_rag.initSourceDb();
    // Minimal word-level tokenizer definition covering the query terms.
    await tokenizerFile.writeAsString(
      '{"version":"1.0","truncation":null,"padding":null,'
      '"added_tokens":[],"normalizer":null,'
      '"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,'
      '"decoder":null,"model":{"type":"WordLevel",'
      '"vocab":{"[UNK]":0,"install":1,"checksum":2,"smoke":3,"setup":4},'
      '"unk_token":"[UNK]"}}',
      flush: true,
    );
    // NOTE(review): the tokenizer initialised from the bench file is not
    // re-initialised after the file is deleted — confirm callers handle it.
    await initTokenizer(tokenizerPath: tokenizerFile.path);

    final scopedSourceId = await seedSource(
      name: 'scoped',
      chunkCount: scopedChunkCount,
      seedBase: 0,
    );
    await seedSource(
      name: 'distractor',
      chunkCount: distractorChunkCount,
      seedBase: 7,
    );
    await source_rag.rebuildChunkHnswIndexForCollection(
      collectionId: collectionId,
    );
    await source_rag.rebuildChunkBm25IndexForCollection(
      collectionId: collectionId,
    );

    final variants = <QueryPayloadVariantStats>[];
    for (final bm25Weight in bm25Weights) {
      source_rag.SearchHandle? handle;
      // Reset first so the stats taken below belong to this variant only.
      query_metrics.resetQueryContentReadStats();
      final sw = Stopwatch()..start();
      try {
        handle = await source_rag.searchMetaHybrid(
          collectionId: collectionId,
          queryText: queryText,
          queryEmbedding: queryEmbedding,
          options: source_rag.SearchMetaHybridOptions(
            topK: topK,
            vectorWeight: 1.0 - bm25Weight,
            bm25Weight: bm25Weight,
            sourceIds: _toInt64List([scopedSourceId]),
            adjacentChunks: 0,
          ),
        );
        final hits = await handle.hitMeta();
        sw.stop();
        // Take the stats before the handle is disposed in `finally`.
        final nativeReadStats = query_metrics.takeQueryContentReadStats();

        variants.add(
          _variantStatsFromNative(
            label:
                'scoped exact scan (chunks=$scopedChunkCount, bm25=${bm25Weight.toStringAsFixed(2)})',
            elapsedMs: sw.elapsedMicroseconds / 1000.0,
            hitCount: hits.length,
            nativeReadStats: nativeReadStats,
          ),
        );
      } finally {
        if (handle != null) {
          await handle.dispose();
        }
      }
    }

    return ScopedExactScanBenchResult(
      scopedChunkCount: scopedChunkCount,
      distractorChunkCount: distractorChunkCount,
      chunkContentBytes: chunkContentChars,
      topK: topK,
      bm25Weights: List.unmodifiable(bm25Weights),
      variants: List.unmodifiable(variants),
    );
  } finally {
    query_metrics.resetQueryContentReadStats();
    if (benchPoolOpened) {
      await closeDbPool();
    }
    try {
      await deleteIfExists(benchFile);
      await deleteIfExists(tokenizerFile);
    } finally {
      // Restore even when deleting a temp file failed, so the caller is not
      // left without a usable pool.
      if (benchPoolOpened && restoreDbPath != null) {
        await initDbPool(dbPath: restoreDbPath, maxSize: 4);
      }
    }
  }
}