VisionTransformer constructor
VisionTransformer({required this.imageSize, required this.patchSize, this.numChannels = 3, required this.embedSize, required this.numClasses, this.numLayers = 2, this.numHeads = 4})
Implementation
VisionTransformer({
  required this.imageSize,
  required this.patchSize,
  this.numChannels = 3,
  required this.embedSize,
  required this.numClasses,
  this.numLayers = 2, // Reduced for faster example execution
  this.numHeads = 4, // Reduced for faster example execution
})  : assert(imageSize % patchSize == 0,
          "Image size must be divisible by patch size"),
      assert(embedSize % numHeads == 0,
          "Embed size must be divisible by the number of heads"),
      // Patch embedding projects each flattened patch
      // (patchSize * patchSize * numChannels values) into embedSize dimensions.
      patchProjection =
          Layer.fromNeurons(patchSize * patchSize * numChannels, embedSize),
      // Initialize the [CLS] token as a learnable vector with small random values.
      clsToken = ValueVector.fromDoubleList(List.generate(
          embedSize, (j) => math.Random().nextDouble() * 0.02 - 0.01)),
      // Number of patches along one side: imageSize ~/ patchSize.
      // Example: 224 ~/ 16 = 14 patches per side -> 14 * 14 = 196 patches total,
      // plus 1 for the [CLS] token: (numPatches + 1) positions.
      positionEmbeddings = List.generate(
          (imageSize ~/ patchSize) * (imageSize ~/ patchSize) + 1,
          (i) => ValueVector.fromDoubleList(List.generate(
              embedSize, (j) => math.Random().nextDouble() * 0.02 - 0.01))),
      // The TransformerEncoder is used as the backbone.
      // Its blockSize must match the sequence length (patches + [CLS] token).
      transformerEncoder = TransformerEncoder(
        vocabSize: 0, // Not used; embeddings are provided directly.
        embedSize: embedSize,
        blockSize: (imageSize ~/ patchSize) * (imageSize ~/ patchSize) + 1,
        numLayers: numLayers,
        numHeads: numHeads,
      ),
      // Classification head mapping the final [CLS] embedding to class logits.
      mlpHead = Layer.fromNeurons(embedSize, numClasses);
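As a rough usage sketch (the import path and the small hyperparameter values below are illustrative assumptions, not taken from the library), constructing the model for 32 x 32 RGB images with 8 x 8 patches gives 4 patches per side, so the encoder's blockSize becomes 4 * 4 + 1 = 17 positions including the [CLS] token:

    // Minimal construction sketch. 'vision_transformer.dart' is a hypothetical
    // import path and the hyperparameters are illustrative small values.
    import 'vision_transformer.dart';

    void main() {
      final vit = VisionTransformer(
        imageSize: 32,  // 32 x 32 input images (must be divisible by patchSize)
        patchSize: 8,   // 8 x 8 patches -> 32 ~/ 8 = 4 patches per side
        numChannels: 3, // RGB
        embedSize: 16,  // must be divisible by numHeads (16 % 4 == 0)
        numClasses: 10, // e.g. a 10-class dataset
        // numLayers and numHeads fall back to their defaults (2 and 4).
      );

      // Sequence length seen by the TransformerEncoder:
      // (32 ~/ 8) * (32 ~/ 8) + 1 = 4 * 4 + 1 = 17 ([CLS] + 16 patches).
      final numPatches = (32 ~/ 8) * (32 ~/ 8);
      print('model expects ${vit.imageSize} x ${vit.imageSize} images');
      print('patches: $numPatches, sequence length: ${numPatches + 1}');
    }

Keeping embedSize divisible by numHeads and imageSize divisible by patchSize is enforced by the asserts above, so mismatched values fail fast at construction time rather than during the forward pass.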