evaluate method

@override
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
})
override

Evaluates `modelResponse` and returns an `EvaluationResult`.

`messages` is the full conversation history that produced `modelResponse`. `chatConfiguration` is required when the evaluator itself uses an AI model. `additionalContext` provides domain-specific context beyond what is in `messages`.

Implementation

/// Scores the text of [modelResponse] against reference texts using BLEU.
///
/// The references are taken from the first [BLEUEvaluatorContext] found in
/// [additionalContext]. The returned [EvaluationResult] holds a single
/// [NumericMetric] named [bleuMetricName].
///
/// An error diagnostic is attached to the metric, and no score is set, when:
///
/// * the response text is empty,
/// * [additionalContext] contains no [BLEUEvaluatorContext], or
/// * that context has no references.
///
/// [messages], [chatConfiguration] and [cancellationToken] are not read by
/// this implementation.
@override
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
}) async {
  final metric = NumericMetric(bleuMetricName);
  final result = EvaluationResult.fromList([metric]);

  final responseText = modelResponse.text;
  if (responseText.isEmpty) {
    metric.addDiagnostic(EvaluationDiagnostic.error(
        'The modelResponse supplied for evaluation was null or empty.'));
    return result;
  }

  final ctx = additionalContext?.whereType<BLEUEvaluatorContext>().firstOrNull;
  if (ctx == null) {
    metric.addDiagnostic(EvaluationDiagnostic.error(
        'A BLEUEvaluatorContext was not found in additionalContext.'));
    return result;
  }
  if (ctx.references.isEmpty) {
    metric.addDiagnostic(EvaluationDiagnostic.error(
        'The supplied BLEUEvaluatorContext contained no references.'));
    return result;
  }

  // A Stopwatch measures elapsed time directly; differencing two
  // DateTime.now() readings can be skewed if the system clock is adjusted
  // while the score is being computed.
  final stopwatch = Stopwatch()..start();

  // Tokenize the references and the response with the same tokenizer so the
  // n-gram comparison operates on like-for-like tokens.
  final references = ctx.references
      .map((r) => SimpleWordTokenizer.wordTokenize(r))
      .toList();
  final hypothesis = SimpleWordTokenizer.wordTokenize(responseText);
  final score = BLEUAlgorithm.sentenceBLEU(
    references,
    hypothesis,
    weights: BLEUAlgorithm.defaultBLEUWeights,
    smoothingFunction: SmoothingFunction.method4,
  );
  stopwatch.stop();

  metric.value = score;
  metric.addOrUpdateDurationMetadata(stopwatch.elapsed);
  metric.addOrUpdateContext(ctx);
  metric.interpretation = metric.interpret();
  return result;
}