I am very new to CUDA programming and was reading the 'CUDA C Programming Guide' provided by nvidia.
(http://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/CUDA_C_Programming_Guide.pdf)
On page 25, it has the following C code that does matrix multiplication. Can you please tell me how I can make that code run on two devices (assuming I have two NVIDIA CUDA-capable cards installed in my computer)? Can you please show me with an example?
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
// Descriptor for a dense float matrix or for a rectangular view into one.
// The struct does not own the storage behind `elements`; in this file the
// pointer refers to host memory (arguments of MatMul) or device memory
// (the d_A/d_B/d_C copies handed to the kernel).
typedef struct {
int width;   // number of columns
int height;  // number of rows
int stride;  // distance, in elements, between the starts of two consecutive rows
float* elements;  // address of element (0, 0)
} Matrix;
// Read the element at (row, col) of A.
// Rows are addressed through A.stride, so A may be a sub-matrix view.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const float* rowStart = A.elements + row * A.stride;
    return rowStart[col];
}
// Store `value` at (row, col) of A.
// A is passed by value, but it only carries a pointer, so the write lands
// in the storage that A.elements refers to.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    float* rowStart = A.elements + row * A.stride;
    rowStart[col] = value;
}
// Thread block size
// (defined ahead of GetSubMatrix, which needs it; a macro must be visible
// before its first textual use)
#define BLOCK_SIZE 16
// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A.
// The result is a view: it shares A's storage and stride, and only its
// origin and its width/height differ from A.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Allocate a device buffer shaped like `host` and describe it in *dev.
// When `upload` is non-zero the host elements are copied to the device too.
// The host matrix is treated as densely packed (width * height floats).
// Returns the first CUDA error encountered, or cudaSuccess.
static cudaError_t MatMulAllocDevice(Matrix* dev, const Matrix host, int upload)
{
    dev->width = dev->stride = host.width;
    dev->height = host.height;
    dev->elements = 0;
    // Widen before multiplying so large matrices do not overflow int.
    const size_t bytes = (size_t)host.width * (size_t)host.height * sizeof(float);
    cudaError_t err = cudaMalloc(&dev->elements, bytes);
    if (err == cudaSuccess && upload)
        err = cudaMemcpy(dev->elements, host.elements, bytes, cudaMemcpyHostToDevice);
    return err;
}

// Matrix multiplication - Host code
// Computes C = A * B on the current device.
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
// A, B and C describe host memory; C.elements receives the result.
// On any CUDA failure the remaining steps are skipped, all device buffers
// are still released, and C is left untouched. The error itself is not
// cleared here, so the caller can inspect it with cudaGetLastError().
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    Matrix d_A, d_B, d_C;
    d_A.elements = d_B.elements = d_C.elements = 0;

    // Load A and B to device memory, allocate C in device memory
    cudaError_t err = MatMulAllocDevice(&d_A, A, 1);
    if (err == cudaSuccess)
        err = MatMulAllocDevice(&d_B, B, 1);
    if (err == cudaSuccess)
        err = MatMulAllocDevice(&d_C, C, 0);

    // Invoke kernel
    if (err == cudaSuccess) {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // Peek (do not clear) so a launch failure stays visible to the caller.
        err = cudaPeekAtLastError();
    }

    // Read C from device memory (blocking copy, also waits for the kernel)
    if (err == cudaSuccess) {
        const size_t bytes = (size_t)C.width * (size_t)C.height * sizeof(float);
        err = cudaMemcpy(C.elements, d_C.elements, bytes, cudaMemcpyDeviceToHost);
    }

    // Free device memory (cudaFree accepts a null pointer)
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
// Tiled product C = A * B. Expects BLOCK_SIZE x BLOCK_SIZE thread blocks and
// one block per BLOCK_SIZE x BLOCK_SIZE tile of C; no bounds checks are made,
// so all dimensions must be multiples of BLOCK_SIZE.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging buffers for the current tile of A and of B
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Which tile of C this block owns
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    // Which element of that tile this thread owns
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    // Running dot product for this thread's element
    float acc = 0;

    // Walk along the row of A-tiles and the column of B-tiles that
    // contribute to this tile of C
    const int tileCount = A.width / BLOCK_SIZE;
    int m = 0;
    while (m < tileCount)
    {
        const Matrix aTile = GetSubMatrix(A, tileRow, m);
        const Matrix bTile = GetSubMatrix(B, m, tileCol);

        // Every thread stages one element of each tile
        As[ty][tx] = GetElement(aTile, ty, tx);
        Bs[ty][tx] = GetElement(bTile, ty, tx);

        // Both tiles must be fully staged before anyone reads them
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e)
            acc += As[ty][e] * Bs[e][tx];

        // Nobody may overwrite the tiles until every thread is done reading
        __syncthreads();

        ++m;
    }

    // Each thread writes its single element of the C tile
    SetElement(GetSubMatrix(C, tileRow, tileCol), ty, tx, acc);
}
There is no "automatic" way to run a CUDA kernel on multiple GPUs.
You will need to devise a way to decompose the matrix multiplication problem into independent operations that can be run in parallel (so one on each GPU in parallel). As a simple example:
C = A.B is equivalent to C = [A].[B1|B2] = [A.B1|A.B2] where B1 and B2 are suitably sized matrices containing the columns of the matrix B and | denotes columnwise concatenation. You can calculate A.B1 and A.B2 as separate matrix multiplication operations, and then perform the concatenation when copying the resulting submatrices back to host memory.
Once you have a suitable decomposition scheme, you then implement it using the standard multi-GPU facilities in the CUDA 4.x API. For a great overview of multi-GPU programming using the CUDA APIs, I recommend watching Paulius Micikevicius' excellent talk from GTC 2012, which is available as a streaming video and PDF here.
The basics are described in the CUDA C Programming Guide under section 3.2.6.
Basically, you can select which GPU your current host thread operates on by calling cudaSetDevice(). You still have to write your own code to decompose your routines so that they are split across multiple GPUs.
Related
I have many small 2D arrays (e.g. M x 32 x 40) and fewer larger 2D arrays (e.g. N x 200 x 300).
I would like to 'put' the smaller matrices at indices n,i,j in the larger arrays (upper left index of the array at batch index n). These small arrays could overlap and should be aggregated by functions that are associative and commutative say plus, multiply, etc.
I figure this is a pretty basic scenario that many people should have come across, right? Is there a cuda implementation that supports this in an efficient way?
Typical values M = 10^6, N = 10^4
This is a reduction operation.
In addition to what is expressed in the comments, I'll make the assumption that the distribution of the M matrices, in terms of which of the N matrices they belong to, is relatively uniform, i.e. evenly distributed. This means, for the dimensions given, that there will be approximately 100 of the M matrices that are intended to update N matrix 0, 100 for N matrix 1, and so on. Furthermore, if we inspect the n array, we would observe a uniformly random pattern of indices (i.e. no clumping or grouping).
Given that, in what may be a first for me, I'll suggest a lock/critical section algorithm, using the plumbing from here. Each threadblock will take one of the M arrays, and attempt to acquire a lock so that it can update the appropriate N array. When finished, release the lock.
I considered other approaches as well, some of which are evident in the code. In any event, for the stated conditions, the lock based approach had a kernel runtime of about 40ms on my V100 GPU, which was the best I observed.
I would also note that the stated dimensions result in a data working set of ~8GB. Not that that is a problem, just be aware if running this code as-is on your laptop GPU.
Here's an example:
$ cat t34.cu
#include <iostream>
#include <cstdlib>
// Problem dimensions. Note that the kernels below take parameters that are
// also called M and N (the data pointers), which shadow these constants
// inside the kernel bodies.
const int N = 10000;    // number of large arrays (the N buffer holds N*Nx*Ny elements)
const int M = 1000000;  // number of small arrays (the M buffer holds M*Mx*My elements)
const int Mx = 32;      // row length of a small array (fastest-varying index)
const int My = 40;      // number of rows in a small array
const int Nx = 200;     // row length of a large array (fastest-varying index)
const int Ny = 300;     // number of rows in a large array
const int nTPB = 256;   // threads per block used when launching kernels k and ki
// Combine an accumulated value `a` with a new contribution `b`.
// Here the reduction is addition; `a` is not modified, the combined value
// is returned to the caller.
template <typename T>
__host__ __device__
T reduction_op(T &a, const T &b){
  const T combined = a+b;
  return combined;
}
// "Naive" reference kernel: block b owns large array b. It scans all num_M
// small arrays and folds in every one whose n[] entry equals b.
// Launch with one block per large array and a 1-D thread block.
template <typename T>
__global__ void k(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
  // Offset of this block's large array inside N
  const unsigned int arrayBase = blockIdx.x*Nx*Ny;
  for (int src = 0; src < num_M; ++src){
    const bool targetsThisBlock = (n[src] == blockIdx.x);
    if (targetsThisBlock){
      // Threads stride across the Mx*My elements of small array `src`
      for (int e = threadIdx.x; e < Mx*My; e += blockDim.x){
        const int y = e/Mx;
        const int x = e - (y*Mx);
        const unsigned int dst = arrayBase + i[src] + (j[src]+y)*Nx + x;
        N[dst] = reduction_op(N[dst], M[src*Mx*My + y*Mx + x]);
      }
    }
    // Finish one small array before starting the next: two small arrays
    // may overlap inside the same large array.
    __syncthreads();
  }
}
// Chunked variant of kernel k: block b owns large array b, but stages it in
// shared memory `sl` rows at a time and applies every overlapping small
// array to the staged chunk before writing the chunk back.
// assumes Ny is whole-number divisible by sl
// Launch with one block per large array, a 1-D thread block, and
// sl*Nx*sizeof(T) bytes of dynamic shared memory.
template <typename T>
__global__ void ki(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, const int sl){
  // A templated `extern __shared__ T s[]` clashes as soon as the kernel is
  // instantiated for a second element type, so the dynamic shared buffer is
  // declared as raw bytes and viewed as T.
  extern __shared__ __align__(16) unsigned char ki_shared_bytes[];
  T *s = reinterpret_cast<T *>(ki_shared_bytes);
  for (int c = 0; c < Ny; c+=sl){ // process per chunk of N array
    // load shared
    for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) s[t] = N[blockIdx.x*Nx*Ny + c*Nx + t];
    __syncthreads();
    // process chunk stack
    for (int ii = 0; ii < num_M; ii++){ // iterate through "stack"
      // only small arrays aimed at this large array whose rows intersect [c, c+sl)
      if ((n[ii] == blockIdx.x) && (j[ii] < (c+sl)) && ((j[ii]+My) > c)) {
        for (int jj = threadIdx.x; jj < sl*Mx; jj += blockDim.x){
          int y = jj/Mx;        // row inside the chunk
          int x = jj - (y*Mx);  // column inside the small array
          // skip chunk rows that the small array does not cover
          if ((y+c >= j[ii]) && (y+c < (j[ii]+My)))
            s[y*Nx+x+i[ii]] = reduction_op(s[y*Nx+x+i[ii]], M[ii*Mx*My + (y+c-j[ii])*Mx + x]);}
      }
      // small arrays may overlap, so apply them one at a time
      __syncthreads();}
    // save shared
    // (each thread writes back, and on the next chunk reloads, exactly the
    // slots it owns, so no extra barrier is needed here)
    for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) N[blockIdx.x*Nx*Ny + c*Nx + t] = s[t];
  }
}
// Atomic variant: block b owns small array b and adds it element by element
// into its target large array with atomicAdd.
// Launch with one block per small array and a 2-D block whose x extent is
// the small-array row length; threads stride over the rows in y.
// num_M is accepted for signature parity with the other kernels but unused.
template <typename T>
__global__ void ka(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
  const int col = threadIdx.x;
  // Placement of this block's small array
  const int target = n[blockIdx.x];
  const int row0 = j[blockIdx.x];
  const int col0 = i[blockIdx.x];
  T *dstArray = N + target*Nx*Ny;
  const T *srcArray = M + blockIdx.x*Mx*My;
  for (int y = threadIdx.y; y < My; y += blockDim.y){
    T *dst = dstArray + (row0+y)*Nx + col0 + col;
    atomicAdd(dst, srcArray[y*Mx + col]);
  }
}
// Spin until the lock word is switched from 0 (free) to 1 (held) by this
// thread. Busy-waits; intended to be called by a single thread per block.
__device__ void acquire_semaphore(volatile int *lock){
  int previous;
  do {
    previous = atomicCAS((int *)lock, 0, 1);
  } while (previous != 0);
}
// Release a lock previously taken with acquire_semaphore().
// lock: address of the lock word; 0 means free, non-zero means held.
__device__ void release_semaphore(volatile int *lock){
*lock = 0; // store through the volatile pointer marks the lock as free
__threadfence(); // memory fence issued right after the releasing store
}
// Lock-based variant: block b owns small array b. It takes the per-array
// lock of its target large array, folds its data in with reduction_op, and
// then releases the lock.
// Launch with one block per small array and a 2-D block whose x extent is
// the small-array row length; `locks` holds one zero-initialised int per
// large array. num_M is unused.
template <typename T>
__global__ void kl(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, int * __restrict__ locks){
  // One thread per block handles the lock on behalf of the whole block
  const bool isLeader = (threadIdx.x == 0) && (threadIdx.y == 0);
  if (isLeader)
    acquire_semaphore(locks+n[blockIdx.x]);
  __syncthreads();
  //begin critical section
  const int col = threadIdx.x;
  const int target = n[blockIdx.x];
  const int col0 = i[blockIdx.x];
  const int row0 = j[blockIdx.x];
  for (int y = threadIdx.y; y < My; y += blockDim.y){
    const int dst = target*Nx*Ny + col0 + (row0+y)*Nx + col;
    N[dst] = reduction_op(N[dst], M[blockIdx.x*Mx*My + y*Mx + col]);
  }
  // end critical section
  __threadfence(); // not strictly necessary for the lock, but to make any global updates in the critical section visible to other threads in the grid
  __syncthreads();
  if (isLeader)
    release_semaphore(locks+n[blockIdx.x]);
}
typedef float mt;

// Abort with a diagnostic if a CUDA runtime call or kernel launch failed.
// `what` names the operation for the error message.
static void check_cuda(cudaError_t err, const char *what){
  if (err != cudaSuccess){
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
    std::exit(EXIT_FAILURE);
  }
}

// Test driver: builds random placements for M small arrays, applies them to
// the N large arrays with the lock-based kernel (kl) and with the naive
// reference kernel (k), and reports the first element where they disagree.
// The working set is roughly 8 GB of device memory for the default sizes.
int main(){
  const size_t M_bytes = (size_t)M*Mx*My*sizeof(mt);
  const size_t N_bytes = (size_t)N*Nx*Ny*sizeof(mt);
  const size_t idx_bytes = (size_t)M*sizeof(int);

  mt *d_M, *h_M, *d_N, *h_N, *r1, *r2;
  int *d_n, *h_n, *d_i, *h_i, *d_j, *h_j;
  h_M = new mt[M*Mx*My];
  h_N = new mt[N*Nx*Ny];
  r1 = new mt[N*Nx*Ny];
  r2 = new mt[N*Nx*Ny];
  h_n = new int[M];
  h_i = new int[M];
  h_j = new int[M];
  check_cuda(cudaMalloc(&d_M, M_bytes), "cudaMalloc d_M");
  check_cuda(cudaMalloc(&d_N, N_bytes), "cudaMalloc d_N");
  check_cuda(cudaMalloc(&d_n, idx_bytes), "cudaMalloc d_n");
  check_cuda(cudaMalloc(&d_i, idx_bytes), "cudaMalloc d_i");
  check_cuda(cudaMalloc(&d_j, idx_bytes), "cudaMalloc d_j");

  // Random target array and upper-left corner for every small array; the
  // ranges keep each small array fully inside its large array.
  for (int i = 0; i < M; i++){
    h_n[i] = rand()%N;
    h_i[i] = rand()%(Nx - Mx);
    h_j[i] = rand()%(Ny - My);}
  // Small integer values keep the float sums exact, so the two kernels can
  // be compared with == regardless of accumulation order.
  for (int i = 0; i < N*Nx*Ny; i++) h_N[i] = (mt)(i%3);
  for (int i = 0; i < M*Mx*My; i++) h_M[i] = (mt)((i%3)+1);

  check_cuda(cudaMemcpy(d_M, h_M, M_bytes, cudaMemcpyHostToDevice), "copy h_M");
  check_cuda(cudaMemcpy(d_N, h_N, N_bytes, cudaMemcpyHostToDevice), "copy h_N");
  check_cuda(cudaMemcpy(d_n, h_n, idx_bytes, cudaMemcpyHostToDevice), "copy h_n");
  check_cuda(cudaMemcpy(d_i, h_i, idx_bytes, cudaMemcpyHostToDevice), "copy h_i");
  check_cuda(cudaMemcpy(d_j, h_j, idx_bytes, cudaMemcpyHostToDevice), "copy h_j");
#ifdef USE_SINGLE_N
  // Worst case for the lock kernel: every small array targets large array 0.
  check_cuda(cudaMemset(d_n, 0, idx_bytes), "cudaMemset d_n");
#endif
#if 0
  const int sl = 40;
  const int sb = sl * Nx * sizeof(mt);
  ki<<<N, nTPB, sb>>>(d_M, d_N, d_n, d_i, d_j, M, sl);
  check_cuda(cudaGetLastError(), "ki launch");
  check_cuda(cudaMemcpy(r2, d_N, N_bytes, cudaMemcpyDeviceToHost), "copy ki result");
#endif
  dim3 block(Mx, 8);
#if 0
  ka<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M);
  check_cuda(cudaGetLastError(), "ka launch");
  check_cuda(cudaMemcpy(r2, d_N, N_bytes, cudaMemcpyDeviceToHost), "copy ka result");
#endif
  // One lock word per large array, all initially free (0).
  int *d_locks;
  check_cuda(cudaMalloc(&d_locks, N*sizeof(int)), "cudaMalloc d_locks");
  check_cuda(cudaMemset(d_locks, 0, N*sizeof(int)), "cudaMemset d_locks");
  kl<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M, d_locks);
  check_cuda(cudaGetLastError(), "kl launch");
  // The blocking copies below also surface errors raised while a kernel ran.
  check_cuda(cudaMemcpy(r2, d_N, N_bytes, cudaMemcpyDeviceToHost), "copy kl result");

  // Reset the large arrays and compute the reference result.
  check_cuda(cudaMemcpy(d_N, h_N, N_bytes, cudaMemcpyHostToDevice), "reset d_N");
  k<<<N, nTPB>>>(d_M, d_N, d_n, d_i, d_j, M);
  check_cuda(cudaGetLastError(), "k launch");
  check_cuda(cudaMemcpy(r1, d_N, N_bytes, cudaMemcpyDeviceToHost), "copy k result");

  // Report only the first disagreement.
  for (int idx = 0; idx < N*Nx*Ny; idx++)
    if (r1[idx] != r2[idx]) {std::cout << "mismatch at: " << idx << " was: " << r2[idx] << " should be: " << r1[idx] << std::endl; break;}

  cudaFree(d_locks);
  cudaFree(d_j);
  cudaFree(d_i);
  cudaFree(d_n);
  cudaFree(d_N);
  cudaFree(d_M);
  delete[] h_j;
  delete[] h_i;
  delete[] h_n;
  delete[] r2;
  delete[] r1;
  delete[] h_N;
  delete[] h_M;
  return 0;
}
$ nvcc -o t34 t34.cu -O3 -lineinfo
$ nvprof ./t34
==17970== NVPROF is profiling process 17970, command: ./t34
==17970== Profiling application: ./t34
==17970== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 34.57% 3.09036s 2 1.54518s 1.54294s 1.54742s [CUDA memcpy DtoH]
33.18% 2.96615s 1 2.96615s 2.96615s 2.96615s void k<float>(float const *, float*, int const *, int const *, int const *, int)
31.81% 2.84401s 6 474.00ms 1.4255ms 1.27035s [CUDA memcpy HtoD]
0.45% 39.949ms 1 39.949ms 39.949ms 39.949ms void kl<float>(float const *, float*, int const *, int const *, int const *, int, int*)
0.00% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memset]
API calls: 96.13% 8.94558s 8 1.11820s 1.9203ms 4.51030s cudaMemcpy
3.60% 334.59ms 6 55.765ms 277.58us 330.37ms cudaMalloc
0.15% 13.752ms 8 1.7190ms 1.3268ms 2.2025ms cuDeviceTotalMem
0.11% 10.472ms 808 12.959us 172ns 728.50us cuDeviceGetAttribute
0.01% 997.81us 8 124.73us 100.93us 176.73us cuDeviceGetName
0.00% 69.047us 2 34.523us 32.349us 36.698us cudaLaunchKernel
0.00% 68.013us 1 68.013us 68.013us 68.013us cudaMemset
0.00% 46.172us 8 5.7710us 1.8940us 23.025us cuDeviceGetPCIBusId
0.00% 8.5060us 16 531ns 260ns 1.5030us cuDeviceGet
0.00% 3.7870us 8 473ns 229ns 881ns cuDeviceGetUuid
0.00% 3.3980us 3 1.1320us 610ns 2.0780us cuDeviceGetCount
$
Extended discussion:
On performance:
This is a memory bound algorithm. Therefore, we can estimate optimal kernel performance by determining the minimum number of memory reads and writes needed to perform the operation, then dividing by the available memory bandwidth, to determine the optimal or lower-bound for kernel duration. Unfortunately the determination of the minimum number of reads and writes depends on the positioning of the M matrices, so cannot be easily generally determined, without inspecting the n, i, and j matrices.
However we can look for another way to estimate. Another approach to estimation would be to observe that each M matrix update will require reading 2 values and writing one value. If we then use that as our estimate, we come up with M*Mx*My*3*sizeof(element_of_M)/GPU_memory_bandwidth. On my V100 (~700GB/s BW) this works out to about 20ms lower bound on kernel duration.
On approaches considered:
"naive" approach, kernel k: Each threadblock will be responsible for one of the N matrices, and will iterate through the M matrices, inspecting n to determine if the M matrices will update the assigned N matrix. This gives a non-optimal run time of ~3s but seems to be mostly invariant performance-wise based on the distribution of n, and can use an "arbitrary" reduction op.
attempt at "optimal" approach, kernel ki: Each threadblock will be responsible for one of the N matrices, but will only load a chunk of that matrix at a time. It will then proceed through the M matrices updating that chunk, similar to the k kernel. This necessitates more loops through the matrices, but should "almost" only load or save each global memory item the minimum number of times necessary. Nevertheless, the run time is really long, ~40s.
atomic approach, kernel ka: Each threadblock will be responsible for one of the M matrices, and will atomically update the relevant N matrix. Simplicity. And the runtime is "fast" at ~40ms. (The atomic approach may be even faster than this in non-uniform n distributions. I witnessed kernel runtimes as low as 8ms!) However this is not readily generalizable to operations that don't have an atomic equivalent, such as multiply.
lock based approach, kernel kl: Like the atomic approach, each threadblock will be responsible for one of the M matrices, and will first acquire a lock on the relevant N matrix. The lock means that atomics are not necessary. For the uniformly distributed n case presented, it has about the same performance as the atomic case. It has the benefit that it can handle other reduction ops, such as multiply, readily. A disadvantage is that in the presence of non-uniformly-random distribution in n the performance can suffer, with a worst case in the ballpark of the naive kernel (3-5s).
Overall if the requirement for an arbitrary reduction operator can be dropped (e.g. only use addition, for example) then the atomic method may be best.
I have a nxnxm ndarray that I would like to reduce down the m-axis. pyopencl has a built in ReductionKernel that is the following:
class pyopencl.reduction.ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="")
If written the following way, it successfully sums a single vector down to a scalar:
krnl = ReductionKernel(context, numpy.float32, neutral="0",reduce_expr="a+b", map_expr="x[i]", arguments="__global float *x")
sum_A = krnl(d_A).get()
where sum_A is a float and d_A is the vector on device memory.
I'd like to call this kernel and pass an m-length column for each index in the nxn matrix. My strategy was to pass the entire nxnxm ndarray to a parent kernel and then use enqueue_kernel to pass an array to ReductionKernel, but I'm unsure of how the syntax works in terms of receiving the sum. By the way, the matrix is already in row-order so the m-length arrays are already contiguous.
// Sum each contiguous m-length run of `input` and store one value per
// work-item in `output`.
//   input  : n*n*m floats; the run for work-item (i, j) starts at
//            i*m + j*m*n
//   output : n*n floats; work-item (i, j) writes output[i*n + j]
//   m      : length of each run
//   n      : side length of the n x n index space
// Enqueue over a 2-D global range of at least n x n work-items.
// The sum is accumulated directly: OpenCL C has no variable-length arrays,
// so the run cannot be copied into a private `float array[m]`, and a
// serial loop per work-item needs no nested kernel launch.
__kernel void reduce(global float* input,
                     global float* output,
                     const unsigned int m,
                     const unsigned int n)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    // Ignore work-items that fall outside the n x n index space
    // (the global size may have been rounded up).
    if ((unsigned int)i >= n || (unsigned int)j >= n)
        return;

    // First element of this work-item's run: input[i*m + j*m*n]
    const size_t base = (size_t)i * m + (size_t)j * m * n;

    float sum = 0.0f;
    for (unsigned int k = 0; k < m; k++)
    {
        sum += input[base + k];
    }

    // Place result in output[i*n+j]
    // NOTE(review): the input run is addressed as (j, i) while the result
    // is stored at (i, j), i.e. transposed - confirm this is intended.
    output[(size_t)i * n + j] = sum;
}
I am new to using cuda and the magma libraries. I'm trying out some functions on a test problem, a 2D heat equation. The code I wrote seemed to work perfectly for grid sizes of 32, 64, and 128. But it produced wrong results for grid sizes of 256 or larger. I am only posting part of the code here, just enough to reproduce the error. Transferring the final matrix and looking at it in matlab shows that the second call to magmablas_dgemm introduced errors into the solution.
Is there anyone out there who can see why this code would break for larger grid sizes?
// Reproducer for one ADI-style step of a 2D heat-equation solver with MAGMA:
// an explicit step (gemm), an implicit solve (getrs, using an LU
// factorization of Dleft), and a second explicit step. The result is
// printed and dumped to ADI_<N>.bin.
//
// gemm does not support in-place operation (the output must not alias an
// input), so the products are written to a separate scratch buffer,
// dev_Uhat, instead of back into dev_U.
//
// Usage: <program> side_width   (side_width >= 2)
int main(int argc, char* argv[])
{
    // Get parameters for problem set up
    if (argc < 2) {
        printf("usage: %s side_width\n", argv[0]);
        return 1;
    }
    int side_width = atoi(argv[1]); //assuming square grid, N/32 integer
    if (side_width < 2) {
        // dx divides by (side_width-1) and the operators need two boundary rows
        printf("side_width must be at least 2\n");
        return 1;
    }
    double dx = 2.0 / (side_width-1);
    double dt = 0.25 * dx;
    //double Tend = dt*3;// 0.5;

    // create memory pointers for derivative operator matrices and solution matrix
    double* U;
    double* Dleft;
    double* Dright;
    double* dev_U;
    double* dev_Uhat;   // scratch output for gemm, so inputs are never aliased
    double* dev_Dleft;
    double* dev_Dright;

    //initialize the MAGMA system
    magma_init();
    magma_int_t N = side_width;

    // temp variables required by MAGMA functions
    magma_int_t *piv, info, err;
    piv = (magma_int_t*)malloc(N*sizeof(magma_int_t));
    if (piv == NULL) {
        printf("error allocating pivot array\n");
        exit(1);
    }

    // Allocate memory for matrices on host and device
    err = magma_dmalloc_cpu(&U, N*N);
    err += magma_dmalloc_cpu(&Dleft, N*N);
    err += magma_dmalloc_cpu(&Dright, N*N);
    err += magma_dmalloc(&dev_U, N*N);
    err += magma_dmalloc(&dev_Uhat, N*N);
    err += magma_dmalloc(&dev_Dleft, N*N);
    err += magma_dmalloc(&dev_Dright, N*N);
    if (err){
        printf("error in allocation. err number = %d\n", (int)err);
        exit(1);
    }

    // initialize matrices: U to all ones, operators to zero
    for (int k=0; k<N*N; ++k ){
        U[k] = 1.0;
        Dleft[k] = 0.0;
        Dright[k] = 0.0;
    }

    //create derivative operator matrices (tridiagonal interior rows)
    double a = dt/2.0/dx/dx;
    double b = dt/dx/dx;
    Dleft[0] = 1.0;
    Dleft[N*N-1] = 1.0;
    for (int k=1; k<N-1; ++k) {
        Dleft[k*N + k-1] = -a;
        Dleft[k*N + k] = 1+b;
        Dleft[k*N + k+1] = -a;
        Dright[k*N + k-1] = a;
        Dright[k*N + k] = 1-b;
        Dright[k*N + k+1] = a;
    }

    // Determine block and thread amounts
    // (not used in this excerpt; kept for the parts of the program not shown)
    int grid_dim = ((side_width + 31)/32) ;
    int block_dim = 32;
    dim3 gridDim(grid_dim, grid_dim);
    dim3 blockDim(block_dim, block_dim);

    //copy data from host to device
    magma_dsetmatrix(N, N, U, N, dev_U, N);
    magma_dsetmatrix(N, N, Dleft, N, dev_Dleft, N);
    magma_dsetmatrix(N, N, Dright, N, dev_Dright, N);

    // LU factorize the left hand operator matrix
    magma_dgetrf_gpu(N, N, dev_Dleft, N, piv, &info);
    if (info != 0) {
        printf("magma_dgetrf_gpu failed, info = %d\n", (int)info);
        exit(1);
    }

    double tn = 0; //time counter
    // needed to take first step outside while loop because of some tricky transpose nonsense happening
    tn += dt;

    // compute explicit step : Uhat=Dright*U^T
    // (output goes to dev_Uhat; dev_U is only read)
    magmablas_dgemm(MagmaTrans, MagmaNoTrans, N, N, N, 1.0, dev_Dright, N, dev_U, N, 0.0, dev_Uhat, N);

    // implicit step solve : Dleft*U=Uhat
    // (getrs overwrites its right-hand side, so the solution is in dev_Uhat)
    magma_dgetrs_gpu(MagmaTrans, N, N, dev_Dleft, N, piv, dev_Uhat, N, &info);
    if (info != 0) {
        printf("magma_dgetrs_gpu failed, info = %d\n", (int)info);
        exit(1);
    }

    // compute explicit step : Uhat=Dright*U^T
    // (reads the solution from dev_Uhat and writes the product to dev_U)
    magmablas_dgemm(MagmaTrans, MagmaTrans, N, N, N, 1.0, dev_Dright, N, dev_Uhat, N, 0.0, dev_U, N);

    printf("GPU matrix U at time %3.3f \n ", tn);
    magma_dprint_gpu(16, 16, dev_U, N);

    //copy solution from device to host
    magma_dgetmatrix(N, N, dev_U, N, U, N);

    //write data to file
    char filename[256];
    snprintf(filename, sizeof(filename), "ADI_%d.bin", (int)N);
    FILE* fileID = fopen(filename, "wb");
    if (fileID == NULL) {
        printf("could not open %s for writing\n", filename);
    } else {
        fwrite(U, sizeof(double), (size_t)N*(size_t)N, fileID);
        fclose(fileID);
    }

    // host buffers from magma_dmalloc_cpu must go back through magma_free_cpu
    magma_free_cpu(U);
    magma_free_cpu(Dleft);
    magma_free_cpu(Dright);
    magma_free(dev_U);
    magma_free(dev_Uhat);
    magma_free(dev_Dleft);
    magma_free(dev_Dright);
    free(piv);
    magma_finalize();
    return 0;
}
To the best of my knowledge, BLAS/LAPACK gemm has never supported in-place operations, ie.
C := alpha*op( A )*op( B ) + beta*C
cannot be transformed into
A := alpha*op( A )*op( B ) + beta*A
or
B := alpha*op( A )*op( B ) + beta*B
with any guarantee of correctness, even for the canonical case with alpha = 1, beta = 0. If you can follow the Fortran, I would recommend having a look at the reference code from the Dongarra group. That implementation will break if the pointer for the matrix passed as C aliases either A or B.
In multi-threaded or massively parallel BLAS implementations, this is particularly true. Most parallel execution environments don't support any sort of strong or fixed execution ordering. That can mean that operations which unintentionally work in serial versions of linear algebra routines break in parallel, because of the lack of execution order guarantee. If a routine in a parallel BLAS or LAPACK implementation doesn't explicitly say it supports in-place operations, don't assume otherwise, because there be dragons and all of that...
Your MAGMA gemm calls only work at small sizes by accident, and probably because very small matrix sizes don't expose enough parallelism to hit the correctness problems that will arise from aliasing an input and output pointer. If you change your code so that the inputs and output are different memory allocations, I suspect the problem will disappear.
Thanks to #hubs: when calling cublasSgemv, note that CUBLAS_OP_T also affects the vectors (with op(A) = A^T, x must have m elements and y must have n elements).
/*I am learning cuda and cublas for a month, and I want to test the performance of cublas for further use. But in my matrix-vector multiplication using cublasSgemv , the answer is wrong.
I initialize Matrix A and Vector x in row-major. I sent them to device using cudaMemcpy, and call the function cublasSgemv , because the A is row-major, I transpose it using a parameter CUBLAS_OP_T.*/
//the row is 50,and col is 10, A[i]=i;x[i]=1; And A matrix is row major.
//the answer I get is 45,545,.....4545,0,0,0,0,0,0,0,0,........0
// Computes y = A * x on the GPU; returns 0 on success. Defined further down
// in this file; declared here because main() calls it first.
int GpuVec(const float* A,const float* x, float* y,const int row,const int col);

// Builds a row-major row x col matrix with A[i] = i and an all-ones vector,
// multiplies them on the GPU and prints the resulting `row` values.
int main(){
    const int row=50;
    const int col=10;
    const int N=row*col;
    float* A=new float[N];
    float* x=new float[col];      // one entry per matrix column
    float* y_gpu=new float[row];  // one entry per matrix row
    for (int i=0;i<N;i++)
    {
        A[i]=(float)i;
    }
    for (int i=0;i<col;i++)
    {
        x[i]=1;
    }
    const int rc=GpuVec(A,x,y_gpu,row,col); //call the function
    if (rc==0){
        for(int i=0;i<row;i++){
            cout<<" "<<y_gpu[i]<<endl;
        }
    } else {
        // y_gpu was not filled in, so there is nothing meaningful to print
        cout<<"GpuVec failed"<<endl;
    }
    delete[] y_gpu;
    delete[] x;
    delete[] A;
    return rc;
}
// Compute y = A * x with cuBLAS.
//   A   : host pointer, row x col matrix stored in row-major order
//   x   : host pointer, col elements
//   y   : host pointer, receives row elements
// Returns 0 on success and 1 if any CUDA or cuBLAS call failed (y is then
// left unmodified). Device buffers and the cuBLAS handle are always released.
//
// cuBLAS is column-major, so the row-major row x col buffer is handed over
// as a col x row matrix with leading dimension col, and CUBLAS_OP_T is used
// to multiply by its transpose - which is the original A.
int GpuVec(const float* A,const float* x, float* y,const int row,const int col){
    const size_t size=(size_t)row*(size_t)col;
    float* d_A=0; //device matrix
    float* d_x=0; //device vector
    float* d_y=0; //device result
    cublasHandle_t handle=0;
    bool haveHandle=false;

    // Each step runs only if everything before it succeeded.
    bool ok=(cudaMalloc((void**)&d_A,size*sizeof(float))==cudaSuccess);
    ok=ok && (cudaMalloc((void**)&d_x,col*sizeof(float))==cudaSuccess);
    ok=ok && (cudaMalloc((void**)&d_y,row*sizeof(float))==cudaSuccess);
    ok=ok && (cudaMemcpy(d_A,A,sizeof(float)*size,cudaMemcpyHostToDevice)==cudaSuccess); //copy A to device d_A
    ok=ok && (cudaMemcpy(d_x,x,sizeof(float)*col,cudaMemcpyHostToDevice)==cudaSuccess); //copy x to device d_x

    if (ok){
        haveHandle=(cublasCreate(&handle)==CUBLAS_STATUS_SUCCESS);
        ok=haveHandle;
    }
    if (ok){
        const float alf=1.0f;
        const float beta=0.0f;
        // m=col, n=row describe the matrix as cuBLAS sees it (swap col and row)
        ok=(cublasSgemv(handle,CUBLAS_OP_T,col,row,&alf,d_A,col,d_x,1,&beta,d_y,1)==CUBLAS_STATUS_SUCCESS);
    }
    // copy device result to host (blocking, so it also waits for the gemv)
    ok=ok && (cudaMemcpy(y,d_y,sizeof(float)*row,cudaMemcpyDeviceToHost)==cudaSuccess);

    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    if (haveHandle) cublasDestroy(handle);
    return ok ? 0 : 1;
}
To use two-dimensional arrays stored in row-major order in cublas (that works with column-major order) you can call the gemv in this way.
stat = cublasSgemv(handle, CUBLAS_OP_T, col, row, &alf, d_A, col, d_x, 1, &beta, d_y, 1);
You have to swap m (rows) and n (columns) in the call, too, to perform y = A * x, but it allows you to use the cublas call without transposing the original array.
For my GPU programming class, we've been tasked with completing certain parts of a non-square matrix multiplication program. Specifically, the kernel function and initializing the thread block and kernel grid dimensions.
I've based my code on the CUDA C Programming Guide's matrix multiplication code, but instead of using structs as they do, I have modified mine to use only the parameters given (since we're not allowed to change parameters). We are provided with the 3 matrices A, B, and C, as well as the dimensions of them- m x k, k x n, and m x n, respectively. Where the struct used A.height, I've used dimension m, where it used B.width, I've used dimension n, etc.
I've run into several problems, the first of which is that my program doesn't pass the included test, which verifies the correctness of the product matrix C. I assume that there is something wrong in my matrix multiplication code, then, and that the issue probably arises from me adapting the struct code.
#include <stdio.h>
__global__ void mysgemm(int m, int n, int k, const float *A, const float *B,
                        float* C) {
    /********************************************************************
     *
     * Compute C = A x B
     *   where A is a (m x k) matrix
     *   where B is a (k x n) matrix
     *   where C is a (m x n) matrix
     *
     * All three matrices are indexed in row-major order.
     * Launch with a 2-D grid of 2-D blocks: x covers the n columns of C
     * and y covers its m rows. The grid may be larger than C; threads
     * outside the matrix do nothing.
     *
     ********************************************************************/
    // Each thread computes one element of C
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in a partial tile at the right/bottom edge must neither read
    // A/B nor write C out of bounds.
    if (row >= m || col >= n) return;

    // Dot product of row `row` of A with column `col` of B
    float Cvalue = 0.0f;
    for (int e = 0; e < k; ++e){
        Cvalue += A[row * k + e] * B[e * n + col];
    }
    C[row * n + col] = Cvalue;
}
My other problem, which I'm even less sure about, involves the code to initialize the thread block and kernel grid dimensions.
// Initialize thread block and kernel grid dimensions ---------------------
const unsigned int BLOCK_SIZE = 16; // Use 16x16 thread blocks
//INSERT CODE HERE
// One thread per element of C: x spans the n columns, y spans the m rows.
const dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
// NOTE(review): the divisions round down, so when n or m is not a multiple
// of BLOCK_SIZE the right/bottom edge of C is not covered. Rounding up
// instead is only safe once the kernel bounds-checks its row and column.
const dim3 dimGrid(n / BLOCK_SIZE, m / BLOCK_SIZE);
// Invoke CUDA kernel -----------------------------------------------------
//INSERT CODE HERE
mysgemm<<<dimGrid, dimBlock>>>(m, n, k, A, B, C);
I understand dimBlock, but I don't understand dimGrid, and don't have a proper idea of what to use as parameters for it. When I run the code as is, the kernel won't even launch if the matrix I pass in doesn't have a dimension that is a power of 2. And if I do use a power of 2, the test still fails.
I apologize if I've been too wordy. This is my first post and I wanted to give as many details as possible. Hopefully someone can help walk me through these issues.
The following kernel I'm posting below is a variant of the one I posted in
CUDA: Tiled matrix-matrix multiplication with shared memory and matrix size which is non-multiple of the block size
in that it does not use shared memory.
// C = A * B for row-major matrices of arbitrary size, without shared memory.
// Each thread computes one element of C. Launch with TILE_DIM x TILE_DIM
// thread blocks and a grid that covers C (rounded up); threads that fall
// outside the matrices skip the loads and the store.
__global__ void MatMulNoShared(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {
    const int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    const int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    float CValue = 0;
    // The row/column test does not depend on the loop index, so it is made
    // once instead of on every iteration.
    if (Row < ARows && Col < BCols) {
        // Only indices valid for both a row of A and a column of B contribute.
        const int inner = (ACols < BRows) ? ACols : BRows;
        for (int e = 0; e < inner; ++e)
            CValue += A[Row*ACols + e] * B[e*BCols + Col];
    }

    // Store through the same Row/Col that were bounds-checked; the index no
    // longer depends on blockDim matching TILE_DIM.
    if (Row < CRows && Col < CCols) C[Row*CCols + Col] = CValue;
}
The two if statements in the kernel are the if statements mentioned in the answer by Eric.
For the sake of your convenience, I'm posting the full code below:
#include <stdio.h>
#include <math.h>
#include <conio.h>
#define TILE_DIM 16 // Tile dimension
#define DIMX 373
#define DIMY 242
#define DIMZ 533
// C = A * B for row-major matrices of arbitrary size, without shared memory.
// Each thread computes one element of C. Launch with TILE_DIM x TILE_DIM
// thread blocks and a grid that covers C (rounded up); threads that fall
// outside the matrices skip the loads and the store.
__global__ void MatMulNoShared(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {
    const int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    const int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    float CValue = 0;
    // The row/column test does not depend on the loop index, so it is made
    // once instead of on every iteration.
    if (Row < ARows && Col < BCols) {
        // Only indices valid for both a row of A and a column of B contribute.
        const int inner = (ACols < BRows) ? ACols : BRows;
        for (int e = 0; e < inner; ++e)
            CValue += A[Row*ACols + e] * B[e*BCols + Col];
    }

    // Store through the same Row/Col that were bounds-checked; the index no
    // longer depends on blockDim matching TILE_DIM.
    if (Row < CRows && Col < CCols) C[Row*CCols + Col] = CValue;
}
// Driver: multiplies a random DIMX x DIMY matrix by a random DIMY x DIMZ
// matrix on the GPU with MatMulNoShared and copies the DIMX x DIMZ product
// back to the host. Returns 0 on success and 1 on any allocation or CUDA error.
int main() {
    int CCols = DIMZ, CRows=DIMX, ACols=DIMY, ARows=DIMX, BCols=DIMZ, BRows=DIMY;

    // One thread per element of C; the grid is rounded up so that partial
    // tiles at the right and bottom edges are covered as well.
    dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
    dim3 dimGrid;
    dimGrid.x = (CCols + dimBlock.x - 1)/dimBlock.x;
    dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;

    const size_t bytesA = (size_t)DIMX*DIMY*sizeof(float);
    const size_t bytesB = (size_t)DIMY*DIMZ*sizeof(float);
    const size_t bytesC = (size_t)DIMX*DIMZ*sizeof(float);

    float *deviceA = 0, *deviceB = 0, *deviceC = 0;
    float* hostA = (float*)malloc(bytesA);
    float* hostB = (float*)malloc(bytesB);
    float* hostC = (float*)malloc(bytesC);

    int status = 0;
    if (hostA == NULL || hostB == NULL || hostC == NULL) {
        fprintf(stderr, "host allocation failed\n");
        status = 1;
    }

    if (status == 0) {
        // A is DIMX x DIMY and B is DIMY x DIMZ: each one is filled over its
        // own extent, so every element of B is initialised.
        for (int x = 0; x<DIMX; x++)
            for (int y = 0; y<DIMY; y++)
                hostA[x*DIMY+y] = rand()/(float)RAND_MAX;
        for (int y = 0; y<DIMY; y++)
            for (int z = 0; z<DIMZ; z++)
                hostB[y*DIMZ+z] = rand()/(float)RAND_MAX;

        cudaError_t err = cudaMalloc((void **)&deviceA, bytesA);
        if (err == cudaSuccess) err = cudaMalloc((void **)&deviceB, bytesB);
        if (err == cudaSuccess) err = cudaMalloc((void **)&deviceC, bytesC);
        if (err == cudaSuccess) err = cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) err = cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice);
        if (err == cudaSuccess) {
            MatMulNoShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , ACols, BRows ,BCols , CRows , CCols);
            err = cudaGetLastError();   // launch-configuration errors
        }
        // The blocking copy also reports errors raised while the kernel ran.
        if (err == cudaSuccess) err = cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost);

        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
            status = 1;
        }
    }

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
    free(hostA);
    free(hostB);
    free(hostC);
    return status;
}
Note that the two instructions
dimGrid.x = (CCols + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
ensure a full tiled coverage of the matrices, as mentioned at point 1. of Eric's answer.
Your code currently only works when m and n are multiples of 16, which is your block size.
Two things you can do now to make it work on arbitrary sizes.
Make the grid size large enough to cover the whole matrix C. Instead of using the floor of n/blockdim.x as you have done, you could use the ceil of that value by
(n+blockdim.x-1)/blockdim.x
After you have done step 1, the matrix you are multiplying will be a little bit larger because of the ceiling operation. You could then limit the multiplying to the exact size of the result matrix C by adding an if clause in the kernel.
Please refer to CUDA docs for more details, especially the programming guide.
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html