MultimodalTransformer constructor

MultimodalTransformer({
  required AudioTransformer audioEncoder,
  required VideoTransformer videoEncoder,
  required TextTransformer textEncoder,
  required int jointEmbedSize,
  int fusionLayers = 2,
  int fusionHeads = 4,
  int maxTotalSeqLen = 200,
})

Implementation

/// Creates a multimodal transformer from three per-modality encoders and
/// builds the [fusionEncoder] that operates on their combined output.
///
/// Parameters:
/// * [audioEncoder], [videoEncoder], [textEncoder] - the per-modality
///   encoders, stored as given. [textEncoder] is used as a pre-processor for
///   input text. Each encoder's `embedSize` must equal [jointEmbedSize].
/// * [jointEmbedSize] - embedding width shared by all modalities (e.g. 128);
///   passed to the fusion encoder as its `embedSize`.
/// * [fusionLayers] - number of layers in the fusion encoder (default 2).
/// * [fusionHeads] - number of attention heads in the fusion encoder
///   (default 4).
/// * [maxTotalSeqLen] - maximum length of the combined modality sequences
///   (default 200); passed to the fusion encoder as its `blockSize`.
///
/// In builds where assertions are enabled, throws an [AssertionError] if any
/// encoder's `embedSize` differs from [jointEmbedSize]. The checks are
/// compiled out when assertions are disabled, so callers must not rely on
/// them for validation in production.
MultimodalTransformer({
  required this.audioEncoder,
  required this.videoEncoder,
  required this.textEncoder, // Used as a pre-processor for input text
  required this.jointEmbedSize, // e.g., 128
  int fusionLayers = 2,
  int fusionHeads = 4,
  int maxTotalSeqLen = 200, // Max length of combined modality sequences
}) : // Validate first so a misconfiguration fails before anything is built.
     // One assert per encoder: the failed expression reported in the
     // AssertionError then identifies which encoder is mismatched.
     assert(
       audioEncoder.embedSize == jointEmbedSize,
       "All encoder outputs must match jointEmbedSize for concatenation",
     ),
     assert(
       videoEncoder.embedSize == jointEmbedSize,
       "All encoder outputs must match jointEmbedSize for concatenation",
     ),
     assert(
       textEncoder.embedSize == jointEmbedSize,
       "All encoder outputs must match jointEmbedSize for concatenation",
     ),
     fusionEncoder = TransformerEncoder(
       vocabSize: 0, // No direct token embeddings
       embedSize: jointEmbedSize,
       blockSize: maxTotalSeqLen,
       numLayers: fusionLayers,
       numHeads: fusionHeads,
     );