nvml_cublas.cu: 6 changes (5 additions, 1 deletion)

@@ -104,6 +104,9 @@ void calculate( int const &m, int const &n, int const &k, nvmlClass &nvml ) {

     /* Fill the matrices with test data */
     /* Assume square matrices */
+    /* Note: we generate data in row-major order, but cuBLAS interprets it as column-major,
+     * so the matrices are effectively transposed when passed to cuBLAS.
+     * To compensate, we compute C = B × A instead of C = A × B. */
     for ( int i = 0; i < m * m; i++ ) {
         h_A[i] = std::rand( ) / static_cast<data_type>( RAND_MAX );
         h_B[i] = std::rand( ) / static_cast<data_type>( RAND_MAX );

@@ -124,7 +127,8 @@ void calculate( int const &m, int const &n, int const &k, nvmlClass &nvml ) {
     data_type *d_C_ptr = thrust::raw_pointer_cast( &d_C[0] );

     /* Performs operation using cublas */
-    cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A_ptr, lda, d_B_ptr, ldb, &beta, d_C_ptr, ldc );
+    /* Compute C = B * A to compensate for the row-major to column-major mismatch. */
+    cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B_ptr, ldb, d_A_ptr, lda, &beta, d_C_ptr, ldc );
     CUDA_RT_CALL( cudaDeviceSynchronize( ) );

     /* Allocate host memory for reading back the result from device memory */
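
For reference, here is a minimal standalone sketch of the layout trick the patch relies on. Everything below (the sizes m, n, k, the all-ones test data, and the host setup) is an illustrative assumption, not code from this repository. The idea: row-major C = A × B has the same memory layout as column-major C^T = B^T × A^T, so passing B first, swapping m and n, and using the row-major row widths as leading dimensions yields the correct row-major product.

// Minimal sketch of the row-major workaround described in the patch.
// Assumptions (not from the patch): sizes m, n, k and all-ones test data.
#include <cstdio>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>

int main( ) {
    int const m = 2, n = 3, k = 4; // illustrative, non-square sizes

    // Row-major host data: A is m x k, B is k x n, C is m x n.
    std::vector<float> h_A( m * k, 1.0f );
    std::vector<float> h_B( k * n, 1.0f );
    std::vector<float> h_C( m * n, 0.0f );

    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    cudaMalloc( &d_A, m * k * sizeof( float ) );
    cudaMalloc( &d_B, k * n * sizeof( float ) );
    cudaMalloc( &d_C, m * n * sizeof( float ) );
    cudaMemcpy( d_A, h_A.data( ), m * k * sizeof( float ), cudaMemcpyHostToDevice );
    cudaMemcpy( d_B, h_B.data( ), k * n * sizeof( float ), cudaMemcpyHostToDevice );

    cublasHandle_t handle;
    cublasCreate( &handle );

    float const alpha = 1.0f;
    float const beta  = 0.0f;

    // Row-major C = A * B equals column-major C^T = B^T * A^T, so pass B first,
    // swap m and n, and use the row-major row widths (n, k, n) as leading dimensions.
    cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B, n, d_A, k, &beta, d_C, n );
    cudaDeviceSynchronize( );

    cudaMemcpy( h_C.data( ), d_C, m * n * sizeof( float ), cudaMemcpyDeviceToHost );

    // With all-ones inputs every entry of C is k, confirming the layout trick.
    std::printf( "C[0][0] = %.1f (expected %d)\n", h_C[0], k );

    cublasDestroy( handle );
    cudaFree( d_A );
    cudaFree( d_B );
    cudaFree( d_C );
    return 0;
}

In the patch itself the matrices are square (m = n = k), so lda, ldb, and ldc are all equal and the same reordered call works unchanged. A file like this would compile with, e.g., nvcc -o sketch sketch.cu -lcublas.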