Inference method
Run full inference: image → structured JSON output.
imageTensor: preprocessed image tensor (1, 3, H, W)
prompt: task prompt string (e.g., <s_cord-v2>, <s_docvqa>)
maxLength: maximum generation length (default from config)
Returns a DonutResult with raw tokens, decoded text, and parsed JSON.
Example:
// Document parsing (CORD receipt dataset)
final result = model.inference(
imageTensor: preprocessedImage,
prompt: '<s_cord-v2>',
);
print(result.json);
// {'menu': [{'nm': 'Lemon Tea', 'price': '3.50'}], ...}
// Visual Question Answering
final vqaResult = model.inference(
imageTensor: preprocessedImage,
prompt: '<s_docvqa><s_question>What is the total?</s_question><s_answer>',
);
print(vqaResult.text);
Implementation
/// Runs full inference: image → structured JSON output.
///
/// [imageTensor] is the preprocessed image tensor — assumed shape
/// (1, 3, H, W) per the class docs; confirm against the preprocessor.
/// [prompt] is the task prompt string (e.g. `<s_cord-v2>`, `<s_docvqa>`).
/// [maxLength] caps auto-regressive generation; when null, the decoder's
/// default (from config) is used.
///
/// Returns a [DonutResult] carrying the raw generated tokens, the decoded
/// text, and the parsed JSON.
///
/// Throws a [StateError] if no tokenizer has been loaded.
DonutResult inference({
  required Tensor imageTensor,
  required String prompt,
  int? maxLength,
}) {
  // Copy the field to a local so the null check promotes it to non-null
  // (Dart fields don't promote), avoiding repeated `!` assertions below.
  final tok = tokenizer;
  if (tok == null) {
    throw StateError('Tokenizer not loaded. Call setTokenizer() or '
        'loadTokenizer() before inference.');
  }

  // 1. Encode the image into encoder hidden states.
  final encoderOutput = encode(imageTensor);

  // 2. Encode the task prompt into token ids.
  final promptTokens = tok.encode(prompt);

  // 3. Generate tokens auto-regressively until EOS or maxLength.
  final generatedTokens = decode(
    encoderOutput: encoderOutput,
    promptTokens: promptTokens,
    maxLength: maxLength,
    eosTokenId: tok.eosTokenId,
  );

  // 4. Decode generated token ids back into text.
  final outputText = tok.decode(generatedTokens);

  // 5. Parse the Donut token markup into a JSON structure.
  final parsedJson = token2json(outputText);

  return DonutResult(
    tokens: generatedTokens,
    text: outputText,
    json: parsedJson,
  );
}