evaluate method

@override
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
})
override

Evaluates modelResponse and returns an EvaluationResult.

`messages` is the full conversation history that produced `modelResponse`. `chatConfiguration` is required when the evaluator itself uses an AI model. `additionalContext` provides domain-specific context beyond what is already in `messages`.

Implementation

/// Scores [modelResponse] for relevance, truth and completeness by asking the
/// chat client of [chatConfiguration] to rate it.
///
/// The returned [EvaluationResult] always contains three [NumericMetric]s,
/// named by `relevanceMetricName`, `truthMetricName` and
/// `completenessMetricName`. On success each metric carries a value, a
/// reason, chat metadata for the evaluation call and an interpretation.
///
/// Instead of throwing, the following conditions attach the same error
/// diagnostic to all three metrics and return them without values:
///
/// * [chatConfiguration] is `null`;
/// * the text of [modelResponse] is empty;
/// * [messages] contains no user message, or the last one has empty text;
/// * the evaluation response cannot be parsed into a conclusive rating.
///
/// The last user message in [messages] is treated as the request being
/// answered; the text of every other message, joined by newlines, is passed
/// to the prompt as conversation history. [additionalContext] is not read by
/// this implementation. [cancellationToken] is forwarded to the chat client.
@override
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
}) async {
  final relevance = NumericMetric(relevanceMetricName);
  final truth = NumericMetric(truthMetricName);
  final completeness = NumericMetric(completenessMetricName);
  final result =
      EvaluationResult.fromList([relevance, truth, completeness]);

  // Attaches a fresh error diagnostic carrying [message] to every metric and
  // hands back the shared result, so each failure path is a single return.
  EvaluationResult fail(String message) {
    for (final metric in [relevance, truth, completeness]) {
      metric.addDiagnostic(EvaluationDiagnostic.error(message));
    }
    return result;
  }

  if (chatConfiguration == null) {
    return fail('chatConfiguration is required for AI-based evaluators.');
  }

  if (modelResponse.text.isEmpty) {
    return fail(
      'The modelResponse supplied for evaluation was null or empty.',
    );
  }

  // Materialize once: the iterable may be lazy and is traversed twice below.
  final msgList = messages.toList();
  final lastUserIndex =
      msgList.lastIndexWhere((m) => m.role == ChatRole.user);
  final lastUser = lastUserIndex < 0 ? null : msgList[lastUserIndex];

  if (lastUser == null || lastUser.text.isEmpty) {
    return fail('No user message found in the conversation history.');
  }

  // Exclude the request by position rather than by equality, so an earlier
  // message that happens to compare equal to it stays in the history.
  final history = [
    for (var i = 0; i < msgList.length; i++)
      if (i != lastUserIndex) msgList[i].text,
  ].join('\n');
  final instructions =
      _buildPrompt(lastUser.text, modelResponse.text, history);

  // Stopwatch is monotonic; differencing two DateTime.now() readings can be
  // skewed (even negative) if the system clock is adjusted during the call.
  final stopwatch = Stopwatch()..start();
  final evalResponse =
      await chatConfiguration.chatClient.getResponse(
    messages: [ChatMessage.fromText(ChatRole.user, instructions)],
    options: _chatOptions,
    cancellationToken: cancellationToken,
  );
  stopwatch.stop();
  final duration = stopwatch.elapsed;

  final rating =
      RelevanceTruthAndCompletenessRating.tryParse(evalResponse.text);

  if (rating == null || rating.isInconclusive) {
    return fail('Could not parse scores from the evaluation response.');
  }

  // The interpretation is derived last, after the metric's value is set.
  relevance.value = rating.relevance.toDouble();
  relevance.reason = rating.relevanceReasoning;
  relevance.addOrUpdateChatMetadata(evalResponse, duration: duration);
  relevance.interpretation = relevance.interpretScore();

  truth.value = rating.truth.toDouble();
  truth.reason = rating.truthReasoning;
  truth.addOrUpdateChatMetadata(evalResponse, duration: duration);
  truth.interpretation = truth.interpretScore();

  completeness.value = rating.completeness.toDouble();
  completeness.reason = rating.completenessReasoning;
  completeness.addOrUpdateChatMetadata(evalResponse, duration: duration);
  completeness.interpretation = completeness.interpretScore();

  return result;
}