MultimodalTransformer constructor
MultimodalTransformer({
- required AudioTransformer audioEncoder,
- required VideoTransformer videoEncoder,
- required TextTransformer textEncoder,
- required int jointEmbedSize,
- int fusionLayers = 2,
- int fusionHeads = 4,
- int maxTotalSeqLen = 200,
})
Implementation
/// Creates a multimodal transformer from three per-modality encoders and
/// constructs the fusion encoder that operates on their combined output.
///
/// * [audioEncoder], [videoEncoder]: encoders for the audio and video inputs.
/// * [textEncoder]: will be used as a pre-processor for input text.
/// * [jointEmbedSize]: embedding size handed to the fusion encoder
///   (e.g., 128).
/// * [fusionLayers] / [fusionHeads]: forwarded to the fusion encoder as
///   `numLayers` / `numHeads`.
/// * [maxTotalSeqLen]: max length of combined modality sequences; forwarded
///   to the fusion encoder as `blockSize`.
///
/// When assertions are enabled, verifies that the `embedSize` of every
/// encoder equals [jointEmbedSize].
MultimodalTransformer({
  required this.audioEncoder,
  required this.videoEncoder,
  required this.textEncoder,
  required this.jointEmbedSize,
  int fusionLayers = 2,
  int fusionHeads = 4,
  int maxTotalSeqLen = 200,
})  : fusionEncoder = TransformerEncoder(
        // The fusion stage has no direct token embeddings, hence vocabSize 0.
        vocabSize: 0,
        embedSize: jointEmbedSize,
        numHeads: fusionHeads,
        numLayers: fusionLayers,
        blockSize: maxTotalSeqLen,
      ),
      assert(
        [
          audioEncoder.embedSize,
          videoEncoder.embedSize,
          textEncoder.embedSize,
        ].every((encoderWidth) => encoderWidth == jointEmbedSize),
        "All encoder outputs must match jointEmbedSize for concatenation",
      );