forward method

Tensor forward(
  Tensor audio,
  Tensor video,
  List<int> inputTextTokens,
  List<Tensor> tracker,
)

Implementation

/// Encodes the three modalities, joins their embeddings into one sequence,
/// and returns that sequence after it has gone through `fusionEncoder`.
///
/// [audio], [video] and [inputTextTokens] are handed to `audioEncoder`,
/// `videoEncoder` and `textEncoder` respectively, in that order. The same
/// [tracker] list is passed to every encoder call; in addition, the
/// concatenated tensor is appended to [tracker] before the fusion step runs.
///
/// Returns the value produced by `fusionEncoder.forwardEmbeddings`. The
/// original notes describe it as the encoder output consumed by the text
/// decoder — confirm against the caller.
Tensor forward(
  Tensor audio,
  Tensor video,
  List<int> inputTextTokens,
  List<Tensor> tracker,
) {
  // Per-modality sequence embeddings. Original notes give the shapes as
  // [SeqA, Embed], [SeqV, Embed] and [SeqT, Embed] — TODO confirm against
  // the encoder implementations.
  final audioEmbeddings = audioEncoder.forward(audio, tracker);
  final videoEmbeddings = videoEncoder.forward(video, tracker);
  final textEmbeddings = textEncoder.forward(inputTextTokens, tracker);

  // Join in audio -> video -> text order (presumably along the sequence
  // axis; verify against Tensor.concat) and record the result in tracker.
  final multimodalSequence = Tensor.concat([
    audioEmbeddings,
    videoEmbeddings,
    textEmbeddings,
  ]);
  tracker.add(multimodalSequence);

  // The fusion pass always runs here; there is no branch that skips it.
  return fusionEncoder.forwardEmbeddings(multimodalSequence, tracker);
}