matMul method

Tensor matMul(Tensor other)

Implementation

/// Multiplies this tensor by [other] as 2-D matrices via `engine.matMulCuda`.
///
/// This tensor is treated as an `M x K` matrix and [other] as a `K x N`
/// matrix; both are read from their flat `data` buffers, which are expected
/// to be laid out in row-major order. The operands and the result are staged
/// in natively allocated `float` buffers for the duration of the call.
///
/// Returns a new [Tensor] of shape `[M, N]` whose data is a Dart-owned copy of
/// the native result, so it remains valid after the native buffers are freed.
///
/// Also prints the dimensions, both operands, and the result (debug output).
///
/// Throws an [ArgumentError] if either tensor is not rank 2 or if the inner
/// dimensions (`shape[1]` and `other.shape[0]`) differ.
Tensor matMul(Tensor other) {
  if (shape.length != 2 || other.shape.length != 2) {
    throw ArgumentError(
      'matMul requires two rank-2 tensors, got shapes $shape and '
      '${other.shape}.',
    );
  }
  if (shape[1] != other.shape[0]) {
    throw ArgumentError(
      'matMul inner dimensions must match, got shapes $shape and '
      '${other.shape}.',
    );
  }

  // Matrix dimensions: A(M x K) * B(K x N) = C(M x N).
  final int m = shape[0];
  final int k = shape[1];
  final int n = other.shape[1];

  print("M: $m, K: $k, N: $n");

  // Native staging buffers: flat, 1-D arrays in row-major order.
  final Pointer<Float> hostA = calloc<Float>(m * k);
  final Pointer<Float> hostB = calloc<Float>(k * n);
  final Pointer<Float> hostC = calloc<Float>(m * n); // Receives the result.

  try {
    // Copy both operands into native memory through typed-list views.
    hostA.asTypedList(m * k).setAll(0, data);
    hostB.asTypedList(k * n).setAll(0, other.data);

    print('Matrix A:');
    printMatrix(hostA, m, k);
    print('Matrix B:');
    printMatrix(hostB, k, n);

    print('\nCalling CUDA function...');
    engine.matMulCuda(hostA, hostB, hostC, m, n, k);
    print('CUDA function returned.');

    print('\nResult Matrix C:');
    printMatrix(hostC, m, n);

    // `asTypedList` is only a view over the native buffer; `sublist` copies
    // it into Dart-managed memory so the returned tensor does not point at
    // memory released in the `finally` block below.
    return Tensor([m, n], hostC.asTypedList(m * n).sublist(0));
  } finally {
    // Always release the native buffers, including when a step above throws.
    calloc.free(hostA);
    calloc.free(hostB);
    calloc.free(hostC);
  }
}