matMul method
Implementation
/// Returns the matrix product of this tensor and [other], computed by the
/// native `engine.matMulCuda` routine.
///
/// This tensor is treated as an `M x K` matrix and [other] as a `K x N`
/// matrix, both stored as flat arrays in row-major order. The returned
/// tensor has shape `[M, N]` and owns its own copy of the result data.
///
/// The operands and the result are printed as a side effect.
///
/// Throws an [ArgumentError] if either tensor is not two-dimensional or if
/// the inner dimensions of the two tensors differ.
Tensor matMul(Tensor other) {
  if (shape.length != 2 || other.shape.length != 2) {
    throw ArgumentError(
      'matMul requires two 2-D tensors, got shapes $shape and ${other.shape}',
    );
  }
  if (shape[1] != other.shape[0]) {
    throw ArgumentError(
      'matMul inner dimensions differ: $shape vs ${other.shape}',
    );
  }

  // Matrix dimensions: A(M x K) * B(K x N) = C(M x N).
  final m = shape[0];
  final k = shape[1];
  final n = other.shape[1];
  print("M: $m, K: $k, N: $n");

  // Native buffers the C side can read and write. Matrices are stored as
  // flat, 1D arrays in row-major order.
  final Pointer<Float> hostA = calloc<Float>(m * k);
  final Pointer<Float> hostB = calloc<Float>(k * n);
  final Pointer<Float> hostC = calloc<Float>(m * n); // For the result.

  try {
    // Copy both operands into native memory.
    hostA.asTypedList(m * k).setAll(0, data);
    hostB.asTypedList(k * n).setAll(0, other.data);

    print('Matrix A:');
    printMatrix(hostA, m, k);
    print('Matrix B:');
    printMatrix(hostB, k, n);

    print('\nCalling CUDA function...');
    engine.matMulCuda(hostA, hostB, hostC, m, n, k);
    print('CUDA function returned.');

    print('\nResult Matrix C:');
    printMatrix(hostC, m, n);

    // `asTypedList` is only a view over native memory, so copy the result
    // onto the Dart heap *before* the buffer is released below. Returning
    // the view itself would hand the caller a dangling pointer.
    final result = hostC.asTypedList(m * n).sublist(0);
    return Tensor([m, n], result);
  } finally {
    // Always release the native buffers, even if something above throws.
    calloc.free(hostA);
    calloc.free(hostB);
    calloc.free(hostC);
  }
}