forward method
Implementation
/// Encodes the audio, video, and text inputs and fuses them into one tensor.
///
/// Each modality is run through its own encoder (audio, then video, then
/// text). The three results are joined with `Tensor.concat` in that same
/// order, and the joined tensor is passed through
/// `fusionEncoder.forwardEmbeddings`, whose result is returned.
///
/// [tracker] is handed to every encoder call and also receives the
/// concatenated tensor via `add` before the fusion step runs.
///
/// The returned tensor is intended to serve as the encoder output consumed by
/// the text decoder.
Tensor forward(
  Tensor audio,
  Tensor video,
  List<int> inputTextTokens,
  List<Tensor> tracker,
) {
  // Step 1: per-modality sequence embeddings. The calls are kept in
  // audio -> video -> text order because each one is given `tracker`.
  // Presumably shaped [SeqA, Embed], [SeqV, Embed], [SeqT, Embed] — TODO
  // confirm against the encoder implementations.
  final audioEmbeddings = audioEncoder.forward(audio, tracker);
  final videoEmbeddings = videoEncoder.forward(video, tracker);
  final textEmbeddings = textEncoder.forward(inputTextTokens, tracker);

  // Step 2: join the three sequences into a single tensor and record it.
  final joined = Tensor.concat([
    audioEmbeddings,
    videoEmbeddings,
    textEmbeddings,
  ]);
  tracker.add(joined);

  // Step 3: run the joined sequence through the fusion encoder for deeper
  // cross-modal interaction. This step is always executed here.
  return fusionEncoder.forwardEmbeddings(joined, tracker);
}