Getting nan results using Peer-to-Peer in Tesla K80 Cluster

I'm using UVA and OpenMP together in my algorithm to speed it up. When I launch a parallel region in which, for example, 3 CPU threads each launch a kernel at the same time, one of the threads sees nan values.
It seems that GPU X cannot read a variable from GPU0.
That is odd considering that I grant every GPU access to GPU0 (here GPUs 1 and 2).
Is there a problem with using UVA and OpenMP together, or is it a problem in the code?
Here are the code and the results. I've created an MCVE to demonstrate the error:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include <omp.h>
#include <cufft.h>

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    return (bool)(pProp->tccDriver ? true : false);
#else
    return (bool)(pProp->major >= 2);
#endif
}

inline bool IsAppBuiltAs64()
{
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
    return 1;
#else
    return 0;
#endif
}

__global__ void kernelFunction(cufftComplex *I, int i, int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;
    int k = threadIdx.y + blockDim.y * blockIdx.y;

    if (j == 0 && k == 0) {
        printf("I'm thread %d and I'm reading device_I[0] = %f\n", i, I[N*j+k].x);
    }
}

__host__ int main(int argc, char **argv) {
    int num_gpus;
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1) {
        printf("No CUDA capable devices were detected\n");
        return 1;
    }
    if (!IsAppBuiltAs64()) {
        printf("%s is only supported on 64-bit OSs and the application must be built as a 64-bit target. Test is being waived.\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

    printf("Number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("Number of CUDA devices:\t%d\n", num_gpus);

    for (int i = 0; i < num_gpus; i++) {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("> GPU%d = \"%15s\" %s capable of Peer-to-Peer (P2P)\n", i, dprop.name, (IsGPUCapableP2P(&dprop) ? "IS " : "NOT"));
    }
    printf("---------------------------\n");

    num_gpus = 3; // The case that fails
    omp_set_num_threads(num_gpus);

    if (num_gpus > 1) {
        for (int i = 1; i < num_gpus; i++) {
            cudaDeviceProp dprop0, dpropX;
            cudaGetDeviceProperties(&dprop0, 0);
            cudaGetDeviceProperties(&dpropX, i);
            int canAccessPeer0_x, canAccessPeerx_0;
            cudaDeviceCanAccessPeer(&canAccessPeer0_x, 0, i);
            cudaDeviceCanAccessPeer(&canAccessPeerx_0, i, 0);
            printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dprop0.name, 0, dpropX.name, i, canAccessPeer0_x ? "Yes" : "No");
            printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", dpropX.name, i, dprop0.name, 0, canAccessPeerx_0 ? "Yes" : "No");
            if (canAccessPeer0_x == 0 || canAccessPeerx_0 == 0) {
                printf("Two or more SM 2.0 class GPUs are required for %s to run.\n", argv[0]);
                printf("Support for UVA requires a GPU with SM 2.0 capabilities.\n");
                printf("Peer to Peer access is not available between GPU%d <-> GPU%d, waiving test.\n", 0, i);
                exit(EXIT_SUCCESS);
            } else {
                cudaSetDevice(0);
                printf("Granting access from 0 to %d...\n", i);
                cudaDeviceEnablePeerAccess(i, 0);
                cudaSetDevice(i);
                printf("Granting access from %d to 0...\n", i);
                cudaDeviceEnablePeerAccess(0, 0);

                printf("Checking GPU%d and GPU%d for UVA capabilities...\n", 0, i);
                const bool has_uva = (dprop0.unifiedAddressing && dpropX.unifiedAddressing);
                printf("> %s (GPU%d) supports UVA: %s\n", dprop0.name, 0, (dprop0.unifiedAddressing ? "Yes" : "No"));
                printf("> %s (GPU%d) supports UVA: %s\n", dpropX.name, i, (dpropX.unifiedAddressing ? "Yes" : "No"));
                if (has_uva) {
                    printf("Both GPUs can support UVA, enabling...\n");
                } else {
                    printf("At least one of the two GPUs does NOT support UVA, waiving test.\n");
                    exit(EXIT_SUCCESS);
                }
            }
        }
    }

    int M = 512;
    int N = 512;

    cufftComplex *host_I = (cufftComplex*)malloc(M*N*sizeof(cufftComplex));
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            host_I[N*i+j].x = 0.001;
            host_I[N*i+j].y = 0;
        }
    }

    cufftComplex *device_I;
    cudaSetDevice(0);
    cudaMalloc((void**)&device_I, sizeof(cufftComplex)*M*N);
    cudaMemset(device_I, 0, sizeof(cufftComplex)*M*N);
    cudaMemcpy2D(device_I, sizeof(cufftComplex), host_I, sizeof(cufftComplex), sizeof(cufftComplex), M*N, cudaMemcpyHostToDevice);

    dim3 threads(32,32);
    dim3 blocks(M/threads.x, N/threads.y);
    dim3 threadsPerBlockNN = threads;
    dim3 numBlocksNN = blocks;

    #pragma omp parallel
    {
        unsigned int i = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        cudaSetDevice(i % num_gpus); // "% num_gpus" allows more CPU threads than GPU devices
        cudaGetDevice(&gpu_id);

        kernelFunction<<<numBlocksNN, threadsPerBlockNN>>>(device_I, i, N);
        cudaDeviceSynchronize();
    }

    cudaFree(device_I);

    for (int i = 1; i < num_gpus; i++) {
        cudaSetDevice(0);
        cudaDeviceDisablePeerAccess(i);
        cudaSetDevice(i);
        cudaDeviceDisablePeerAccess(0);
    }
    for (int i = 0; i < num_gpus; i++) {
        cudaSetDevice(i);
        cudaDeviceReset();
    }
    free(host_I);
}
The results are:
Both GPUs can support UVA, enabling...
I'm thread 0 and I'm reading device_I[0] = 0.001000
I'm thread 2 and I'm reading device_I[0] = 0.001000
I'm thread 1 and I'm reading device_I[0] = -nan
The command line to compile is:
nvcc -Xcompiler -fopenmp -lgomp -arch=sm_37 main.cu -lcufft
Here is the result of simpleP2P:
[miguel.carcamo@belka simpleP2P]$ ./simpleP2P
[./simpleP2P] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 8
> GPU0 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU1 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU2 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU3 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU4 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU5 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU6 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
> GPU7 = " Tesla K80" IS capable of Peer-to-Peer (P2P)
Checking GPU(s) for support of peer to peer memory access...
> Peer-to-Peer (P2P) access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer-to-Peer (P2P) access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
Enabling peer access between GPU0 and GPU1...
Checking GPU0 and GPU1 for UVA capabilities...
> Tesla K80 (GPU0) supports UVA: Yes
> Tesla K80 (GPU1) supports UVA: Yes
Both GPUs can support UVA, enabling...
Allocating buffers (64MB on GPU0, GPU1 and CPU Host)...
Creating event handles...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 0.79GB/s
Preparing host buffer and memcpy to GPU0...
Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...
Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...
Copy data back to host from GPU0 and verify results...
Verification error @ element 0: val = nan, ref = 0.000000
Verification error @ element 1: val = nan, ref = 4.000000
Verification error @ element 2: val = nan, ref = 8.000000
Verification error @ element 3: val = nan, ref = 12.000000
Verification error @ element 4: val = nan, ref = 16.000000
Verification error @ element 5: val = nan, ref = 20.000000
Verification error @ element 6: val = nan, ref = 24.000000
Verification error @ element 7: val = nan, ref = 28.000000
Verification error @ element 8: val = nan, ref = 32.000000
Verification error @ element 9: val = nan, ref = 36.000000
Verification error @ element 10: val = nan, ref = 40.000000
Verification error @ element 11: val = nan, ref = 44.000000
Enabling peer access...
Shutting down...
Test failed!

It seems, based on the debugging in the comments, that the problem was ultimately related to the system that was being used, not OP's code.
The K80 is a dual-GPU device, so it has a PCIE bridge chip on board. Proper use of this configuration, especially when using Peer-to-Peer (P2P) traffic, requires correct settings in the upstream PCIE switches and/or root complex. These settings are normally made by the system BIOS and are not typically software-configurable.
One possible indicator when these settings are incorrect is that the simpleP2P CUDA sample code will report errors during results validation. Therefore, a good test on any system where you are having trouble with P2P code is to run this particular CUDA sample code (simpleP2P). If validation errors are reported (see OP's posting for an example), then these should be addressed first, before any attempt is made to debug the user's P2P code.
The best recommendation is to use a system that has been validated by the system vendor for K80 usage. This is generally good practice for any usage of Tesla GPUs, as these GPUs tend to make significant demands on the host system from the standpoints of:
power delivery
cooling requirements
system compatibility (two examples are the types of PCIE settings being discussed here, as well as resource mapping and bootability issues also referred to by OP in the comments)
OEM validated systems will generally have the fewest issues associated with the above requirements/demands that Tesla GPUs place on the host system.
For this particular issue, troubleshooting starts with the simpleP2P test. When validation errors are observed in that test (but no other CUDA runtime errors are reported), then the PCIE settings may be suspect. The easiest way to attempt to address these is to check for a newer/updated system BIOS which may have the settings correct for this type of usage, or else offers a BIOS setup option that allows the user to make the necessary changes. The settings involved here are PCIE ACS settings, and if a BIOS setup option is available, those terms will likely be involved. Since BIOS setup varies from system to system, it's not possible to be specific here.
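On Linux, the current ACS control bits can usually be inspected from user space as a quick check (requires root; output format varies by lspci version):

sudo lspci -vvv | grep -i acsctl

Enabled redirection bits shown in ACSCtl on a switch between the two GPUs are one hint that ACS may be interfering with P2P traffic.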
If the BIOS update and/or settings modification does not resolve the issue, then it's probably not fixable for that particular system type. It's possible to troubleshoot the process a bit further using the final steps described here, but such troubleshooting, even if successful, cannot lead to a permanent fix (i.e. one that will survive a reboot) without BIOS modifications.
If the simpleP2P test runs correctly, then debug focus should return to the user's code. General recommendations of using proper cuda error checking and running the code with cuda-memcheck apply. Furthermore, the simpleP2P sample source code can be then referred to as an example of correct usage of P2P functionality.
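For reference, here is a minimal sketch of the kind of error checking meant, using only the standard CUDA runtime API (the macro name is arbitrary):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Wrap every runtime call; prints the failing call's error string with
// file and line, then aborts.
#define CUDA_CHECK(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// Kernel launches return no status directly, so check them like this:
// kernel<<<grid, block>>>(...);
// CUDA_CHECK(cudaGetLastError());       // catches launch-time errors
// CUDA_CHECK(cudaDeviceSynchronize());  // catches asynchronous errors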
Note that in general, P2P support may vary by GPU or GPU family. The ability to run P2P on one GPU type or GPU family does not necessarily indicate it will work on another GPU type or family, even in the same system/setup. The final determinant of GPU P2P support are the tools provided that query the runtime via cudaDeviceCanAccessPeer. P2P support can vary by system and other factors as well. No statements made here are a guarantee of P2P support for any particular GPU in any particular setup.

Related

Get maximum available register for Linux PCI device

I am currently debugging a Linux kernel driver.
I want to sweep a PCI device's mmio registers to scan for certain information.
This is the function I wrote so far.
void _sweep_registers(struct pci_dev *dev)
{
    int i;
    int activecontrolstatus;
    int activestatus;

    /* privdata is the driver's private state (not shown); privdata->mmio
       is the ioremap()ed base of the device's register BAR. */
    for (i = 0; i < AMD_P2C_MSG_INTSTS; i++) {
        activecontrolstatus = readl(privdata->mmio + i);
        activestatus = activecontrolstatus >> 4;
        dev_info(&dev->dev, "activecontrolstatus = %d / activestatus = %d",
                 activecontrolstatus, activestatus);
    }
}
Currently I am reading mmio until what's specified in AMD_P2C_MSG_INTSTS (which is 0x10694).
But how far can I actually go?
I have zero knowledge of Linux kernel development and only rudimentary knowledge of C.
Background
My goal is to find information about which sensors of the AMD Sensor Fusion Hub are marked as active.
They should be under register 0x1068C, but on my system that register reads 0x0; at least an accelerometer is available, so the bitmask should match at least 0x1.
I want to see whether they are stored somewhere else.
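One way to bound the sweep itself is the BAR's actual length: pci_resource_len() reports the size of the region backing the mapping, so reads can stay inside the mapped window. A minimal sketch (the BAR index 0 below is an assumption; use whichever BAR privdata->mmio was ioremap()ed from):

#include <linux/io.h>
#include <linux/pci.h>

/* Sketch: sweep no further than the BAR is long. */
static void sweep_registers_bounded(struct pci_dev *dev, void __iomem *mmio)
{
    resource_size_t len = pci_resource_len(dev, 0);
    resource_size_t off;

    for (off = 0; off + sizeof(u32) <= len; off += sizeof(u32)) {
        u32 val = readl(mmio + off);
        dev_info(&dev->dev, "reg 0x%llx = 0x%08x",
                 (unsigned long long)off, val);
    }
}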

PAPI_num_counters() shows the system doesn't have available counters

I have a question regarding PAPI (Performance Application Programming Interface). I downloaded and installed the PAPI library, but I am still not sure how to use it correctly or what else I need to make it work. I am trying to use it in C. I have this simple program:
#include <stdio.h>
#include <stdlib.h>
#include <papi.h>

int main()
{
    int retval = PAPI_library_init(PAPI_VER_CURRENT);
    if (retval != PAPI_VER_CURRENT && retval > 0) {
        printf("PAPI error: 1\n");
        exit(1);
    }
    if (retval < 0)
        printf("PAPI error: 2\n");
    retval = PAPI_is_initialized();
    if (retval != PAPI_LOW_LEVEL_INITED)
        printf("PAPI error: 2\n");

    int num_hwcntrs = 0;
    if ((num_hwcntrs = PAPI_num_counters()) <= PAPI_OK)
        printf("This system has %d available counters. \n", num_hwcntrs);
    return 0;
}
I have included the papi.h header and I am compiling with gcc and the -lpapi flag. I added the library to the path so it compiles and runs, but as a result I get this:
This system has 0 available counters.
Though initialization seems to work, as it doesn't give an error code.
Any advice or suggestion would be helpful to determine what I have done wrong or missed to get it running correctly. I mean, I should have available counters in my system; more precisely, I need cache miss and cache hit counters.
I tried to count the available counters after running this other simple program, and it gave error code -25:
int numEvents = 2;
long long values[2];
int events[2] = {PAPI_L3_TCA,PAPI_L3_TCM};
printf("PAPI error: %d\n", PAPI_start_counters(events, numEvents));
UPDATE: I just tried to check the hardware information from the terminal with the command papi_avail | more, and I got this:
Available PAPI preset and user defined events plus hardware information.
PAPI version : 5.7.0.0
Operating system : Linux 4.15.0-45-generic
Vendor string and code : GenuineIntel (1, 0x1)
Model string and code : Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz (78, 0x4e)
CPU revision : 3.000000
CPUID : Family/Model/Stepping 6/78/3, 0x06/0x4e/0x03
CPU Max MHz : 2800
CPU Min MHz : 400
Total cores : 4
SMT threads per core : 2
Cores per socket : 2
Sockets : 1
Cores per NUMA region : 4
NUMA regions : 1
Running in a VM : no
Number Hardware Counters : 0
Max Multiplex Counters : 384
Fast counter read (rdpmc): no
PAPI Preset Events
Name Code Avail Deriv Description (Note)
PAPI_L1_DCM 0x80000000 No No Level 1 data cache misses
PAPI_L1_ICM 0x80000001 No No Level 1 instruction cache misses
PAPI_L2_DCM 0x80000002 No No Level 2 data cache misses
PAPI_L2_ICM 0x80000003 No No Level 2 instruction cache misses
.......
So because Number Hardware Counters is 0, I can't use this tool to count cache misses with PAPI's preset events? Is there any configuration that could help, or should I forget about it until I change my laptop?

VIDIOC_ENUMINPUT Not returning any video standards

I have been playing around with a userspace application based on the uvc driver and v4l2. I have been trying to get the capabilities of my integrated webcam (this is a laptop), and I ran into one problem: my driver does not set any video standard flags in response to the VIDIOC_ENUMINPUT ioctl. Following is my code.
struct v4l2_capability caps;
memset(&caps, 0, sizeof(caps));
if (-1 == ioctl(fd, VIDIOC_QUERYCAP, &caps)) {
    perror("Unable to query capabilities");
    return errno;
}
printf(
    "-------- VIDIOC_QUERYCAP --------\n"
    "Driver = %s\n"
    "Card = %s\n"
    "Bus Info = %s\n"
    "Version = %d\n"
    "Capabilities = %#x\n"
    "Device Caps = %#x\n",
    caps.driver,
    caps.card,
    caps.bus_info,
    caps.version,
    caps.capabilities,
    caps.device_caps);

int index;
if (-1 == ioctl(fd, VIDIOC_G_INPUT, &index)) {
    perror("Unable to get current input index");
    return errno;
}

struct v4l2_input input;
memset(&input, 0, sizeof(input));
input.index = index;
if (-1 == ioctl(fd, VIDIOC_ENUMINPUT, &input)) {
    perror("Unable to query attributes of video input");
    return errno;
}
printf(
    "--------- VIDIOC_ENUMINPUT ---------\n"
    "Index = %d\n"
    "Name = %s\n"
    "Type = %d\n"
    "Audio Set = %d\n"
    "Video Stds = %llu\n"
    "Status = %d\n"
    "Capabilities = %d\n",
    input.index,
    input.name,
    input.type,
    input.audioset,
    input.std,
    input.status,
    input.capabilities);
And the output looks like the following.
-------- VIDIOC_QUERYCAP --------
Driver = uvcvideo
Card = Integrated_Webcam_HD: Integrate
Bus Info = usb-0000:00:1d.0-1.6
Version = 266001
Capabilities = 0x84200001
Device Caps = 0x4200001
--------- VIDIOC_ENUMINPUT ---------
Index = 0
Name = Camera 1
Type = 2
Audio Set = 0
Video Stds = 0 // <--- Problem here.
Status = 0
Capabilities = 0
Notice that the video standards flag is set to 0. To drill further down into the problem, I tried the VIDIOC_G_STD ioctl, as follows:
/* VIDIOC_G_STD takes a v4l2_std_id, not a struct v4l2_standard */
v4l2_std_id std = 0;
if (-1 == ioctl(fd, VIDIOC_G_STD, &std)) {
    perror("Error");
    return errno;
}
But receives the following error.
Error: Inappropriate ioctl for device
What could be the conclusion? Am I doing anything wrong here?
Platform Details
Linux linux 4.15.0-20-generic #21-Ubuntu SMP Tue Apr 24 06:16:15 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
Driver version: 4.15.17
Device node : /dev/video0 (only one device)
I think I found the answer myself. On closer evaluation, I found that the integrated webcam on my laptop is internally on the USB bus. USB class devices are an exception for the v4l2 video standard ioctls. As per the documentation,
Special rules apply to devices such as USB cameras where the notion of video standards makes little sense. More generally for any capture or output device which is incapable of capturing fields or frames at the nominal rate of the video standard, or that does not support the video standard formats at all. Here the driver shall set the std field of struct v4l2_input and struct v4l2_output to zero and the VIDIOC_G_STD, VIDIOC_S_STD, ioctl VIDIOC_QUERYSTD and ioctl VIDIOC_ENUMSTD ioctls shall return the ENOTTY error code or the EINVAL error code.
Thus, I think my camera falls into one of these categories, and the STD query is not really applicable in my case. I'm not sure whether this is true for MIPI or parallel buses; I will update once I do a little more experimenting with that hardware.
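A minimal sketch of how an application can treat that outcome as expected rather than as a failure (fd is an open V4L2 device node, as in the code above):

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* VIDIOC_G_STD takes a v4l2_std_id (a 64-bit flag set). ENOTTY just
   means the device has no notion of analog video standards. */
v4l2_std_id std_id = 0;
if (-1 == ioctl(fd, VIDIOC_G_STD, &std_id)) {
    if (errno == ENOTTY)
        printf("Analog video standards not applicable to this device.\n");
    else
        perror("VIDIOC_G_STD");
}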

How can I get number of Cores in cuda device?

I am looking for a function that counts the number of cores of my CUDA device. I know each multiprocessor has a specific number of cores, and my CUDA device has 2 multiprocessors.
I searched a lot for a property function that reports the number of cores per multiprocessor, but I couldn't find one. I use the code below, but I still need the number of cores.
CUDA 7.0
programming language: C
Visual Studio 2013
Code:
void printDevProp(cudaDeviceProp devProp)
{
    printf("%s\n", devProp.name);
    printf("Major revision number: %d\n", devProp.major);
    printf("Minor revision number: %d\n", devProp.minor);
    printf("Total global memory: %zu bytes\n", devProp.totalGlobalMem);
    printf("Number of multiprocessors: %d\n", devProp.multiProcessorCount);
    printf("Total amount of shared memory per block: %zu\n", devProp.sharedMemPerBlock);
    printf("Total registers per block: %d\n", devProp.regsPerBlock);
    printf("Warp size: %d\n", devProp.warpSize);
    printf("Maximum memory pitch: %zu\n", devProp.memPitch);
    printf("Total amount of constant memory: %zu\n", devProp.totalConstMem);
    return;
}
The cores per multiprocessor is the only "missing" piece of data. That data is not provided directly in the cudaDeviceProp structure, but it can be inferred from published per-architecture data using the devProp.major and devProp.minor entries, which together make up the CUDA compute capability of the device.
Something like this should work:
#include "cuda_runtime_api.h"
// you must first call the cudaGetDeviceProperties() function, then pass
// the devProp structure returned to this function:
int getSPcores(cudaDeviceProp devProp)
{
int cores = 0;
int mp = devProp.multiProcessorCount;
switch (devProp.major){
case 2: // Fermi
if (devProp.minor == 1) cores = mp * 48;
else cores = mp * 32;
break;
case 3: // Kepler
cores = mp * 192;
break;
case 5: // Maxwell
cores = mp * 128;
break;
case 6: // Pascal
if ((devProp.minor == 1) || (devProp.minor == 2)) cores = mp * 128;
else if (devProp.minor == 0) cores = mp * 64;
else printf("Unknown device type\n");
break;
case 7: // Volta and Turing
if ((devProp.minor == 0) || (devProp.minor == 5)) cores = mp * 64;
else printf("Unknown device type\n");
break;
case 8: // Ampere
if (devProp.minor == 0) cores = mp * 64;
else if (devProp.minor == 6) cores = mp * 128;
else if (devProp.minor == 9) cores = mp * 128; // ada lovelace
else printf("Unknown device type\n");
break;
case 9: // Hopper
if (devProp.minor == 0) cores = mp * 128;
else printf("Unknown device type\n");
break;
default:
printf("Unknown device type\n");
break;
}
return cores;
}
(coded in browser)
"cores" is a bit of a marketing term. The most common connotation in my opinion is to equate it with SP units in the SM. That is the meaning I have demonstrated here. I've also omitted cc 1.x devices from this, as those device types are no longer supported in CUDA 7.0 and CUDA 7.5
A pythonic version is here
On Linux you can run the following command to get the number of CUDA cores:
nvidia-settings -q CUDACores -t
To get the output of this command in C, use the popen function.
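For example, a minimal sketch (it assumes nvidia-settings is on the PATH and an X session is reachable; with multiple GPUs the command prints one value per GPU and this reads only the first):

#include <stdio.h>

int cudaCoresFromNvidiaSettings(void)
{
    // Run the command and parse the integer it prints.
    FILE *p = popen("nvidia-settings -q CUDACores -t", "r");
    int cores = -1;
    if (p) {
        if (fscanf(p, "%d", &cores) != 1)
            cores = -1;  // nothing parseable was printed
        pclose(p);
    }
    return cores;
}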
As Vraj Pandya already said, there is a function (_ConvertSMVer2Cores) in the Common/helper_cuda.h file of NVIDIA's cuda-samples GitHub repository which provides this functionality. You just need to multiply its result by the multiprocessor count from the GPU.
Just wanted to provide a current link.
#include <cuda.h>
#include <cuda_runtime.h>
#include <helper_cuda.h> // You need to place this file somewhere it can be found
                         // by the compiler. The file itself also requires the
                         // helper_string.h file (in the same folder as helper_cuda.h).

int deviceID;
cudaDeviceProp props;

cudaGetDevice(&deviceID);
cudaGetDeviceProperties(&props, deviceID);
int CUDACores = _ConvertSMVer2Cores(props.major, props.minor) * props.multiProcessorCount;
Maybe this might help a bit more.
https://devtalk.nvidia.com/default/topic/470848/cuda-programming-and-performance/what-39-s-the-proper-way-to-detect-sp-cuda-cores-count-per-sm-/post/4414371/#4414371
"there is a library helper_cuda.h which contains a routine
_ConvertSMVer2Cores(int major, int minor) which takes the compute capability level
of the GPU and returns the number of cores (stream processors) in each SM or SMX"
-from the post.

detecting NVIDIA GPUs without CUDA

I would like to extract a rather limited set of information about NVIDIA GPUs without linking against the CUDA libraries. The only information that is needed is the compute capability and name of the GPU; more than this could be useful but is not required. The code should be written in C (or C++). The information would be used at configure-time (when the CUDA toolkit is not available) and at run-time (when the executed binary is not compiled with CUDA support) to inform the user that a supported GPU is present in the system.
As far as I understand, this is possible through the driver API, but I am not very familiar with the technical details of what this would require. So my questions are:
What are the exact steps to fulfill at least the minimum requirement (see above);
Is there such open-source code available?
Note that my first step would be to have some code for Linux, but ultimately I'd need platform-independent code. Considering the platform availability of CUDA, a complete solution would involve code for x86/AMD64 on Linux, Mac OS, and Windows (at least for now; the list could soon be extended with ARM).
Edit
What I meant by "it's possible through the driver API" is that one should be able to load libcuda.so dynamically and query the device properties through the driver API. I'm not sure about the details, though.
Unfortunately NVML doesn't provide information about device compute capability.
What you need to do is:
Load CUDA library manually (application is not linked against libcuda)
If the library doesn't exist then CUDA driver is not installed
Find pointers to necessary functions in the library
Use driver API to query information about available GPUs
I hope this code will be helpful. I've tested it under Linux but with minor modifications it should also compile under Windows.
#include <cuda.h>
#include <stdio.h>

#ifdef WINDOWS
#include <Windows.h>
#else
#include <dlfcn.h>
#endif

void *loadCudaLibrary() {
#ifdef WINDOWS
    return LoadLibraryA("nvcuda.dll");
#else
    return dlopen("libcuda.so", RTLD_NOW);
#endif
}

void (*getProcAddress(void *lib, const char *name))(void){
#ifdef WINDOWS
    return (void (*)(void)) GetProcAddress(lib, name);
#else
    return (void (*)(void)) dlsym(lib, (const char *)name);
#endif
}

int freeLibrary(void *lib)
{
#ifdef WINDOWS
    return FreeLibrary(lib);
#else
    return dlclose(lib);
#endif
}

typedef CUresult CUDAAPI (*cuInit_pt)(unsigned int Flags);
typedef CUresult CUDAAPI (*cuDeviceGetCount_pt)(int *count);
typedef CUresult CUDAAPI (*cuDeviceComputeCapability_pt)(int *major, int *minor, CUdevice dev);

int main() {
    void *cuLib;
    cuInit_pt my_cuInit = NULL;
    cuDeviceGetCount_pt my_cuDeviceGetCount = NULL;
    cuDeviceComputeCapability_pt my_cuDeviceComputeCapability = NULL;

    if ((cuLib = loadCudaLibrary()) == NULL)
        return 1; // cuda library is not present in the system

    if ((my_cuInit = (cuInit_pt) getProcAddress(cuLib, "cuInit")) == NULL)
        return 1; // sth is wrong with the library
    if ((my_cuDeviceGetCount = (cuDeviceGetCount_pt) getProcAddress(cuLib, "cuDeviceGetCount")) == NULL)
        return 1; // sth is wrong with the library
    if ((my_cuDeviceComputeCapability = (cuDeviceComputeCapability_pt) getProcAddress(cuLib, "cuDeviceComputeCapability")) == NULL)
        return 1; // sth is wrong with the library

    {
        int count, i;
        if (CUDA_SUCCESS != my_cuInit(0))
            return 1; // failed to initialize
        if (CUDA_SUCCESS != my_cuDeviceGetCount(&count))
            return 1; // failed
        for (i = 0; i < count; i++)
        {
            int major, minor;
            if (CUDA_SUCCESS != my_cuDeviceComputeCapability(&major, &minor, i))
                return 1; // failed
            printf("dev %d CUDA compute capability major %d minor %d\n", i, major, minor);
        }
    }
    freeLibrary(cuLib);
    return 0;
}
Test on Linux:
$ gcc main.c -ldl
$ ./a.out
dev 0 CUDA compute capability major 2 minor 0
dev 1 CUDA compute capability major 2 minor 0
Test on Linux with no CUDA driver:
$ ./a.out
$ echo $?
1
These people surely know the answer:
http://www.ozone3d.net/gpu_caps_viewer
but I only know that it can be done with an installation of CUDA or OpenCL.
I think one way could be to use OpenGL directly; maybe that is what you were talking about with the driver API. I can only give you this example (CUDA required):
http://www.naic.edu/~phil/hardware/nvidia/doc/src/deviceQuery/deviceQuery.cpp
First, I think NVIDIA NVML is the API you are looking for. Second, there is an open-source project based on NVML called PAPI NVML.
I solved this problem by linking statically against the CUDA 6.0 SDK. This produces an application that also works on machines without NVIDIA cards or on machines where the SDK is not installed; in such cases it will indicate that there are zero CUDA capable devices.
There is an example in the samples included with the CUDA SDK called deviceQuery. I used snippets from it to write the following code, which decides whether any CUDA capable devices are present and, if so, which has the highest compute capability:
#include <cuda_runtime.h>

struct GpuCap
{
    bool QueryFailed;            // True on error
    int DeviceCount;             // Number of CUDA devices found
    int StrongestDeviceId;       // ID of best CUDA device
    int ComputeCapabilityMajor;  // Major compute capability (of best device)
    int ComputeCapabilityMinor;  // Minor compute capability
};

GpuCap GetCapabilities()
{
    GpuCap gpu;
    gpu.QueryFailed = false;
    gpu.StrongestDeviceId = -1;
    gpu.ComputeCapabilityMajor = -1;
    gpu.ComputeCapabilityMinor = -1;

    cudaError_t error_id = cudaGetDeviceCount(&gpu.DeviceCount);
    if (error_id != cudaSuccess)
    {
        gpu.QueryFailed = true;
        gpu.DeviceCount = 0;
        return gpu;
    }
    if (gpu.DeviceCount == 0)
        return gpu; // There are no available devices that support CUDA

    // Find best device
    for (int dev = 0; dev < gpu.DeviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (deviceProp.major > gpu.ComputeCapabilityMajor)
        {
            gpu.StrongestDeviceId = dev;
            gpu.ComputeCapabilityMajor = deviceProp.major;
            gpu.ComputeCapabilityMinor = deviceProp.minor;
        }
        else if (deviceProp.major == gpu.ComputeCapabilityMajor &&
                 deviceProp.minor > gpu.ComputeCapabilityMinor)
        {
            gpu.StrongestDeviceId = dev;
            gpu.ComputeCapabilityMinor = deviceProp.minor;
        }
    }
    return gpu;
}
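A minimal usage sketch for the function above:

#include <stdio.h>

int main()
{
    GpuCap gpu = GetCapabilities();
    if (gpu.QueryFailed || gpu.DeviceCount == 0) {
        printf("No CUDA capable devices found.\n");
        return 1;
    }
    printf("Best device: %d (compute capability %d.%d)\n",
           gpu.StrongestDeviceId, gpu.ComputeCapabilityMajor,
           gpu.ComputeCapabilityMinor);
    return 0;
}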
