evaluate method
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
})
override
Evaluates modelResponse and returns an EvaluationResult.
messages is the full conversation history that produced
modelResponse. chatConfiguration is required when the evaluator
itself uses an AI model. additionalContext provides domain-specific
context beyond what is in messages.
Implementation
/// Scores [modelResponse] for relevance, truth and completeness.
///
/// Builds a single grading prompt from the last user message in [messages],
/// the response text and the text of the remaining messages, sends it to
/// `chatConfiguration.chatClient`, and parses the reply into three
/// [NumericMetric]s.
///
/// The returned [EvaluationResult] always contains all three metrics. When
/// evaluation cannot proceed — [chatConfiguration] is null, the response
/// text is empty, no user message with non-empty text exists, or the reply
/// cannot be parsed or is inconclusive — every metric receives the same
/// error diagnostic and no score is assigned.
///
/// [additionalContext] is accepted to satisfy the overridden signature but
/// is not read by this implementation. [cancellationToken] is forwarded to
/// the chat client unchanged.
@override
Future<EvaluationResult> evaluate(
  Iterable<ChatMessage> messages,
  ChatResponse modelResponse, {
  ChatConfiguration? chatConfiguration,
  Iterable<EvaluationContext>? additionalContext,
  CancellationToken? cancellationToken,
}) async {
  final relevance = NumericMetric(relevanceMetricName);
  final truth = NumericMetric(truthMetricName);
  final completeness = NumericMetric(completenessMetricName);
  final result =
      EvaluationResult.fromList([relevance, truth, completeness]);

  // Attaches a fresh error diagnostic to every metric and yields the shared
  // result, so each failure path below is a single return statement.
  EvaluationResult fail(String message) {
    for (final metric in [relevance, truth, completeness]) {
      metric.addDiagnostic(EvaluationDiagnostic.error(message));
    }
    return result;
  }

  if (chatConfiguration == null) {
    return fail('chatConfiguration is required for AI-based evaluators.');
  }

  if (modelResponse.text.isEmpty) {
    return fail(
      'The modelResponse supplied for evaluation was null or empty.',
    );
  }

  // Materialize once: the iterable is scanned for the last user turn and
  // then walked again to build the history text.
  final msgList = messages.toList();
  final lastUserIndex =
      msgList.lastIndexWhere((m) => m.role == ChatRole.user);
  final lastUser = lastUserIndex < 0 ? null : msgList[lastUserIndex];
  if (lastUser == null || lastUser.text.isEmpty) {
    return fail('No user message found in the conversation history.');
  }

  // History is the text of every message that does not compare equal to the
  // user turn being evaluated, one message per line.
  final history =
      msgList.where((m) => m != lastUser).map((m) => m.text).join('\n');
  final instructions =
      _buildPrompt(lastUser.text, modelResponse.text, history);

  // Stopwatch is monotonic; subtracting two DateTime.now() readings can give
  // a wrong or negative duration if the system clock is adjusted mid-call.
  final stopwatch = Stopwatch()..start();
  final evalResponse = await chatConfiguration.chatClient.getResponse(
    messages: [ChatMessage.fromText(ChatRole.user, instructions)],
    options: _chatOptions,
    cancellationToken: cancellationToken,
  );
  stopwatch.stop();
  final duration = stopwatch.elapsed;

  final rating =
      RelevanceTruthAndCompletenessRating.tryParse(evalResponse.text);
  if (rating == null || rating.isInconclusive) {
    return fail('Could not parse scores from the evaluation response.');
  }

  // The value is assigned before interpretScore() runs in each cascade, so
  // the interpretation is computed with the score already in place.
  relevance
    ..value = rating.relevance.toDouble()
    ..reason = rating.relevanceReasoning
    ..addOrUpdateChatMetadata(evalResponse, duration: duration)
    ..interpretation = relevance.interpretScore();

  truth
    ..value = rating.truth.toDouble()
    ..reason = rating.truthReasoning
    ..addOrUpdateChatMetadata(evalResponse, duration: duration)
    ..interpretation = truth.interpretScore();

  completeness
    ..value = rating.completeness.toDouble()
    ..reason = rating.completenessReasoning
    ..addOrUpdateChatMetadata(evalResponse, duration: duration)
    ..interpretation = completeness.interpretScore();

  return result;
}