Related
Am trying to verify a message using RSA public key with help of WolfCrypt library. The message signing and verification is done using Openssl is successful with below commands.
openssl dgst -sha256 -sign private.pem -out Message.sign.rsa1024-sha-256 Message.txt
openssl dgst -sha256 -verify public.der -signture Message.sign.rsa1024-sha-256 Message.txt
Now, while trying to write a program using WolfCrypt library to verify the message(the program is not complete, Am stuck at parse public key part), the program is raising segmentation fault while parsing the public key itself. However, while trying to debug the program using GDB, the parse key section of code executes does not raise any segmentation fault and going to next step and exiting normally.
To avoid segfault, I tried malloc instead, still getting alloc(): memory corruption error.
It looks like the problem lies in wc_RsaPublicKeyDecode parameters. on GDB, while step-in to this function, the parameters looks empty. Any suggestion is welcome.
#include <stdio.h>
#include <stdlib.h>
/* Import APIs for Signing and Verification */
#include "wolfssl/wolfcrypt/rsa.h"
#include "wolfssl/wolfcrypt/hash.h"
#include "wolfssl/wolfcrypt/signature.h"
/* Import WolfSSL Types */
#include "wolfssl/wolfcrypt/types.h"
typedef struct wrap_Key
{
word32 _KeyIndex;
RsaKey _RsaKey;
}Key_t;
typedef struct wrap_Signature
{
/* Signature Algorithms */
enum wc_SignatureType _TYPE;
enum wc_HashType _DIGEST;
/* Message & Signature */
byte *_Message;
word32 _MessageLength;
byte *_Signature;
word32 _SignatureLength;
byte *_KeyBuffer;
word32 _KeyBufferLength;
/* RSA Key Structure */
Key_t _PKCS;
}Signature_t;
static int wrap_ReadFileToBuffer( byte **BufferData, word32* BufferLength, byte* URI )
{
int ret = EXIT_SUCCESS;
FILE *file = NULL;
file = fopen(URI, "r");
if( NULL == file )
{
printf( "Error! Unable to stat file.\r\n" );
return EXIT_FAILURE;
}
/* Get content length & Reset Cursor */
fseek( file, 0, SEEK_END );
*BufferLength = (word32) ftell(file);
fseek( file, 0, SEEK_SET );
/* Allocate Enough Buffer */
*BufferData = (byte*)(malloc( *BufferLength ));
if( NULL == *BufferData )
{
fclose(file);
printf("Error! Memory Allocation Failed.\r\n");
return EXIT_FAILURE;
}
/* Read File Content */
if( ( ret = fread( *BufferData, 1, *BufferLength, file ) )
!= *BufferLength )
{
fclose(file);
printf("Error! Unable to read file.\r\n");
return EXIT_FAILURE;
}
fclose(file);
return ret;
}
Signature_t *RSA1;
int main()
{
int ret = EXIT_SUCCESS;
RSA1 = malloc(sizeof(Signature_t));
/* Define Signagure & Type */
RSA1->_TYPE = WC_SIGNATURE_TYPE_RSA;
RSA1->_DIGEST = WC_HASH_TYPE_SHA256;
/* Initialize Message & Signature */
RSA1->_Message = NULL;
RSA1->_Signature = NULL;
/* Verify does the Hash given above is supproted? */
if( wc_HashGetDigestSize( RSA1->_DIGEST ) <= 0 )
{
printf("Hash type %d not supported!\n", RSA1->_DIGEST);
return EXIT_FAILURE;
}
if( wrap_ReadFileToBuffer( &(RSA1->_Message),
&(RSA1->_MessageLength), "Message.txt" ) <= 0 )
{
printf("Error! Reading Message Failed.\r\n");
return EXIT_FAILURE;
}
if( wrap_ReadFileToBuffer( &(RSA1->_Signature),
&(RSA1->_SignatureLength),
"Message.sign.rsa1024-sha-256" ) <= 0 )
{
printf("Error! Reading Signature Failed.\r\n");
return EXIT_FAILURE;
}
if( wrap_ReadFileToBuffer( &(RSA1->_KeyBuffer), &(RSA1->_KeyBufferLength),
"public.der" ) <= 0 )
{
printf("Error! Reading Key Failed.\r\n");
return EXIT_FAILURE;
}
if( ( ret = wc_InitRsaKey( &(RSA1->_PKCS._RsaKey), NULL ) ) )
{
printf("Error! Initialize Key Failed: -%d.\r\n", -ret);
return EXIT_FAILURE;
}
RSA1->_PKCS._KeyIndex = 0;
if( ( ret = wc_RsaPublicKeyDecode( RSA1->_KeyBuffer,
&RSA1->_PKCS._KeyIndex,
&RSA1->_PKCS._RsaKey,
RSA1->_KeyBufferLength ) ) )
{
printf("Error! Reading Key Failed: -%d.\r\n", -ret);
return EXIT_FAILURE;
}
free(RSA1);
printf("WolfCrypt - Sample program!\r\n");
return ret;
}
While trying to debug using GDB, found that after wc_InitRsaKey function, the entire structure *RSA1 is getting char array(byte here) is missing it's data.
(gdb) p *RSA1
$43 = {_TYPE = WC_SIGNATURE_TYPE_RSA, _DIGEST = WC_HASH_TYPE_SHA256,
_Message = 0x5555557585d0 "This is sample message to be signed!\n", _MessageLength = 37,
_Signature = 0x555555758600 "$\372\324#\340M\353\"\216\226\302V\372\265\210\242\377\362\343ɮ\032\021\206K\016/\f2\002\020!\274\234\024\212,\034\276\276,31\217\277\274sP\341c\024=u\236\233l\207\330\320>Ė\300K\211]\325\322x\307_9\251\017#\021'\225Oƞ\276\311\a\177\063`\016\271G8\r;\201\036,7x\246\251Wd\246j\273\272\220\304\354\244\305\370\027\321\312\017\250n\336'\375v{\251\267\270\237M", _SignatureLength = 128,
_KeyBuffer = 0x555555758690 "0\201\237\060\r\006\t*\206H\206\367\r\001\001\001\005",
_KeyBufferLength = 162, _PKCS = {_KeyIndex = 0, _RsaKey = {n = {used = 0, alloc = 0,
sign = 0, dp = 0x0}, e = {used = 0, alloc = 0, sign = 0, dp = 0x0}, d = {used = 0,
alloc = 0, sign = 0, dp = 0x0}, p = {used = 0, alloc = 0, sign = 0, dp = 0x0}, q = {
used = 0, alloc = 0, sign = 0, dp = 0x0}, dP = {used = 0, alloc = 0, sign = 0,
dp = 0x0}, dQ = {used = 0, alloc = 0, sign = 0, dp = 0x0}, u = {used = 0, alloc = 0,
sign = 0, dp = 0x0}, heap = 0x0, data = 0x0, type = 0, state = 0, dataLen = 0,
dataIsAlloc = 0 '\000'}}}
(gdb) n
133 RSA1->_PKCS._KeyIndex = 0;
(gdb) p *RSA1
$44 = {_TYPE = WC_SIGNATURE_TYPE_RSA, _DIGEST = WC_HASH_TYPE_SHA256,
_Message = 0x5555557585d0 "", _MessageLength = 37, _Signature = 0x555555758600 "",
_SignatureLength = 128, _KeyBuffer = 0x555555758690 "", _KeyBufferLength = 162, _PKCS = {
_KeyIndex = 0, _RsaKey = {n = {used = 0, alloc = 0, sign = 0, dp = 0x0}, e = {used = 0,
alloc = 0, sign = 0, dp = 0x0}, d = {used = 0, alloc = 0, sign = 0, dp = 0x0}, p = {
used = 0, alloc = 0, sign = 0, dp = 0x0}, q = {used = 0, alloc = 0, sign = 0,
dp = 0x0}, dP = {used = 0, alloc = 0, sign = 0, dp = 0x0}, dQ = {used = 0, alloc = 0,
sign = 0, dp = 0x0}, u = {used = 0, alloc = 0, sign = 0, dp = 0x0}, heap = 0x0,
data = 0x0, type = 0, state = 0, dataLen = 0, dataIsAlloc = 0 '\000'}}}
#Gopi,
The most common cause of such a segmentation fault is a misconfiguration of application and library. The first thing to check is the headers included.
/* Import APIs for Signing and Verification */
#include "wolfssl/wolfcrypt/rsa.h"
#include "wolfssl/wolfcrypt/hash.h"
#include "wolfssl/wolfcrypt/signature.h"
/* Import WolfSSL Types */
#include "wolfssl/wolfcrypt/types.h"
Notice that neither "wolfssl/options.h" or "wolfssl/wolfcrypt/settings.h" was included. If you built the wolfSSL library with ./configure && make please include "wolfssl/options.h" before all other wolfSSL headers. If you are only using "wolfssl/wolfcrypt/settings.h" to control the build and options.h is not found then at least include "wolfssl/wolfcrypt/settings.h" before all other wolfSSL headers:
/* Import APIs for Signing and Verification */
#include <wolfssl/options.h> // If not found or not available then include <wolfssl/wolfcrypt/settings.h>
#include "wolfssl/wolfcrypt/rsa.h"
#include "wolfssl/wolfcrypt/hash.h"
#include "wolfssl/wolfcrypt/signature.h"
/* Import WolfSSL Types */
#include "wolfssl/wolfcrypt/types.h"
Cheers,
K
I'm using libusb to communicate with a Philips ISP1362 configured as a USB device. I am able to successfully loopback data using Synchronous I/O without any problems. For some reason when using Asynchronous I/O there appears to be a race condition.
I am transferring 64-byte packets using back-to-back OUT-IN transfers. Occasionally when I run my program libusb throws a timeout error and some of the loopback data is lost. When analyzing the USB bus using my Beagle 12 I can see the OUT-IN transactions are out of order (i.e. OUT-OUT-IN-TIMEOUT) when it should be (OUT-IN-OUT-IN).
Update The transfers are appearing out of order in the callback function which is strange because they are not coinciding with what is actually on the bus analyzer.
Example 1: (IN-OUT-IN-OUT)
main(): submitting transfer 0, endpoint 1
main(): submitting transfer 1, endpoint 82
main(): submitting transfer 2, endpoint 1
main(): submitting transfer 3, endpoint 82
xfr_cb(): count 0, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 1, status = 0, endpoint = 1, actual_length = 64, completed = 0
xfr_cb(): count 2, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 3, status = 0, endpoint = 1, actual_length = 64, completed = 0
completed
Example 2: (OUT-IN-IN-OUT)
main(): submitting transfer 0, endpoint 1
main(): submitting transfer 1, endpoint 82
main(): submitting transfer 2, endpoint 1
main(): submitting transfer 3, endpoint 82
xfr_cb(): count 0, status = 0, endpoint = 1, actual_length = 64, completed = 0
xfr_cb(): count 1, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 2, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 3, status = 0, endpoint = 1, actual_length = 64, completed = 0
completed
Below is a screenshot from the analyzer:
Below is the code:
#include <stdlib.h>
#include <stdio.h>
#include <libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 4 // number of transfers
#define BYTES EP_SIZE*TRANSFERS
#define TIMEOUT 3*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *devh = NULL;
/* use a global variable to keep the context */
static struct libusb_context *usb_context = NULL;
/* count variable */
int count = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
void xfr_cb(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
/* callback - This is called after the transfer has been received by libusb */
fprintf(stderr, "xfr_cb(): count %d, status = %d, endpoint = %x, actual_length = %d, completed = %d\n",
count,
transfer->status,
transfer->endpoint,
transfer->actual_length,
*completed);
if (transfer->status != LIBUSB_TRANSFER_COMPLETED)
{
/* Error! */
fprintf(stderr, "Error: %s\n", libusb_error_name((int)transfer->status));
}
if (count == TRANSFERS-1)
*completed = 1;
count++;
}
int main(int argc, char **argv)
{
int ep_addr;
int completed = 0;
unsigned char *buf;
size_t length = 64;
int n;
int i;
int rc;
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
devh = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!devh)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* allocate memory */
buf = malloc(length);
/* start with OUT transfer */
ep_addr = ep_out;
/* queue up alternating OUT-IN transfers */
for (i = 0; i < TRANSFERS; i++)
{
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE; n++)
{
buf[n] = i+n;
}
/* Set up the transfer object */
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, devh, ep_addr, buf, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
/* Submit the transfer object */
libusb_submit_transfer(transfer);
fprintf(stderr, "main(): submitting transfer %d, endpoint %x\n", i, ep_addr);
/* alternate writing and reading for loopback */
ep_addr = (ep_addr == ep_out) ? ep_in : ep_out;
}
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc < 0)
{
if (rc == LIBUSB_ERROR_INTERRUPTED)
continue;
fprintf(stderr, "Transfer Error: %s", libusb_error_name(rc));
continue;
}
}
fprintf(stderr, "completed\n");
/* Release the interface */
libusb_release_interface(devh, 0);
/* Close the device handle */
if (devh)
libusb_close(devh);
out:
if (devh)
{
libusb_close(devh);
}
libusb_exit(NULL);
return rc;
}
Update 2 I successfully eliminated the timeout. The cause of the libusb timeout is because the Host was sending two consecutive OUT transactions intermittently on the bus.
Analyzer screenshot:
The following is the working code (no timeouts). Ran these thousands of times with no issues
static void LIBUSB_CALL xfr_cb(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
unsigned char *wbuf, *rbuf;
size_t length = 64;
fprintf(stderr, "xfr_cb(): status = %d, endpoint = %x, actual_length = %d\n",
transfer->status,
transfer->endpoint,
transfer->actual_length);
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int ep_addr;
int completed = 0;
unsigned char *buf, *wbuf1, *wbuf2, *rbuf1, *rbuf2;
size_t length = 64;
int n;
int m;
int i;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
wbuf1 = malloc(length);
wbuf2 = malloc(length);
rbuf1 = malloc(length);
rbuf2 = malloc(length);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE; n++)
wbuf1[n] = n;
for (m = 0; m < EP_SIZE; m++)
wbuf2[m] = m+1;
struct libusb_transfer *transfer1;
struct libusb_transfer *transfer2;
struct libusb_transfer *transfer3;
struct libusb_transfer *transfer4;
/* Set up the transfer object */
transfer1 = libusb_alloc_transfer(0);
transfer2 = libusb_alloc_transfer(0);
transfer3 = libusb_alloc_transfer(0);
transfer4 = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer1, handle, ep_out, wbuf1, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer2, handle, ep_in, rbuf1, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer3, handle, ep_out, wbuf2, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer4, handle, ep_in, rbuf2, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
/* Submit the transfers */
libusb_submit_transfer(transfer1);
libusb_submit_transfer(transfer2);
libusb_submit_transfer(transfer3);
libusb_submit_transfer(transfer4);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
Changing the code as follows (i.e. callback = NULL for transfer 1-3) re-creates intermittent duplicate transactions, as shown in the screenshots.
libusb_fill_bulk_transfer(transfer1, handle, ep_out, wbuf1, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer2, handle, ep_in, rbuf1, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer3, handle, ep_out, wbuf2, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer4, handle, ep_in, rbuf2, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
I honestly don't understand why the loop would cause race conditions based on their documentation and examples. Queueing up multiple transfers is actually suggested in one of the libusb examples (sam3u_benchmark.c) and also demonstrated (using loops) in the following .pdfs.
See asynchronous I/O sections:
https://www.socallinuxexpo.org/sites/default/files/presentations/scale_2017_usb.pdf
http://www.signal11.us/oss/elc2014/elc_2014_usb_0.pdf
From my understanding, the use of libusb_handle_events_completed(NULL, &completed) is supposed to resolve synchronization issues. Am I misunderstanding something?
See libusb_handle_events() from multiple threads
http://libusb.sourceforge.net/api-1.0/libusb_mtasync.html
-"This is why libusb-1.0.9 introduces the new libusb_handle_events_timeout_completed() and libusb_handle_events_completed() functions, which handles doing the completion check for you after they have acquired the lock:"
What they need are crystal clear examples of how to use their API if this is the case.
I can add more event checking but something does not seem right here.
Update 3: See accepted answer.
I started reading the documentation in the libusb source code and understood what was happening.
Particularly the section about how libusb deals with packet sizes:
http://libusb.sourceforge.net/api-1.0/libusb_packetoverflow.html
After reading that it clicked for me and I found two ways to accomplish a loopback test with large data size using asynchronous I/O.
The first way is submitting two transfers consecutively with transfer->buffer containing the entire data structure (i.e. total bytes to send and receive). The second way is submitting the two transfers with transfer->buffer containing wMaxPacketSize (e.g. 64-bytes) and having the out and in callback functions submit additional transfers to transceive the rest of the data.
For the second case, extra code needed to be added to keep track of the number of transfers and to set the completed signal when finished. The OUT-IN packet interleaving is handled by libusb and the OS - which was the part I didn't realize. In other words, not every OUT-IN transfer needed to be specified and queued individually.
Here is the asynchronous code along with the transfer rates to my USB device (ISP1362). My USB device controller is an FPGA coded in pure SystemVerilog.
Note: Regarding the transfer rates, I only have double-buffering enabled on BULK_EP_IN. I am assuming the IN-NAK's (# POLL) and transfer rate would improve in the second approach if double-buffering was enabled on BULK_EP_OUT. So this may not be a fair comparison due to device configuration.
First approach: ~1.161 MB/s (~9.288 Mb/s)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define TIMEOUT 10*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variables */
unsigned int count = 0;
unsigned int count_in = 0;
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
/* Write and Read buffers */
unsigned char wbuf[EP_SIZE*TRANSFERS];
unsigned char wbuf_tmp[EP_SIZE*TRANSFERS];
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char rbuf_tmp[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
}
static void LIBUSB_CALL xfr_cb_in(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
memcpy(rbuf+count_in*EP_SIZE, transfer->buffer, EP_SIZE);
count_in++; // one transfer complete
if (count_in < TRANSFERS)
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int completed = 0;
size_t length = 64;
int n;
int m;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* fill the buffer with incrementing data */
for (n = 0; n < TRANSFERS; n++)
{
for (m = 0; m < EP_SIZE; m++)
{
wbuf_tmp[m+n*EP_SIZE] = m+n;
}
}
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, wbuf_tmp, EP_SIZE*TRANSFERS, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_in, rbuf_tmp, EP_SIZE*TRANSFERS, xfr_cb_in, &completed, TIMEOUT);
libusb_submit_transfer(transfer);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
else
fprintf(stderr, "success\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
Second approach: ~755.9 MB/s (~6.047 Mb/s)
include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define TIMEOUT 10*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variables */
unsigned int count = 0;
unsigned int count_in = 0;
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
/* Write and Read buffers */
unsigned char wbuf[EP_SIZE*TRANSFERS];
unsigned char *wbuf_tmp;
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char rbuf_tmp[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
count_out++; // one transfer complete
if (count_out < TRANSFERS)
{
transfer->buffer = ++wbuf_tmp;
libusb_submit_transfer(transfer);
}
}
static void LIBUSB_CALL xfr_cb_in(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
memcpy(rbuf+count_in*EP_SIZE, transfer->buffer, EP_SIZE);
count_in++; // one transfer complete
if (count_in < TRANSFERS)
libusb_submit_transfer(transfer);
else
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int completed = 0;
size_t length = 64;
int n;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
wbuf_tmp = malloc(length*TRANSFERS);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE*TRANSFERS; n++)
{
wbuf_tmp[n] = n;
}
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, wbuf_tmp, EP_SIZE, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_in, rbuf_tmp, EP_SIZE, xfr_cb_in, &completed, TIMEOUT);
libusb_submit_transfer(transfer);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
else
fprintf(stderr, "success\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
Update: See accepted answer.
The following is an example using Synchronous I/O. I had a lot of trouble getting the transactions to come out in the expected order using Asynchronous I/O. I assume this was due to transfers racing with each other as #Gene had mentioned.
The main gripe I have about the libusb API is the lack of examples to illustrate proper use. The API would lead someone to believe that asynchronous transactions are placed on the bus in the order they are "submitted" and from what I gather this is not true. This functionality would be fine for submitting transactions with all the same packet TOKEN (i.e. OUT or IN).
The following code works for large bulk transfers.
Using Synchronous I/O
#include <stdlib.h>
#include <stdio.h>
#include <libusb-1.0/libusb.h>
/* Change VENDOR_ID and PRODUCT_ID depending on device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define BYTES 1024*768*3 // bytes
#define EP_SIZE 64 // bytes
#define TIMEOUT 5*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *devh = NULL;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in_addr = 0x82;
static int ep_out_addr = 0x01;
int write_chars(unsigned char * data, int length)
{
/* To send a char to the device simply initiate a bulk_transfer to the Endpoint
* with the address ep_out_addr.
*/
int actual_length;
int rc = libusb_bulk_transfer(devh, ep_out_addr, data, length, &actual_length, TIMEOUT);
if (rc < 0)
{
fprintf(stderr, "Error while sending char: %d\n", rc);
return -1;
}
return actual_length;
}
int read_chars(unsigned char * data, int length)
{
/* To receive characters from the device initiate a bulk_transfer to the Entpoint
* with address ep_in_addr
*/
int actual_length;
int rc = libusb_bulk_transfer(devh, ep_in_addr, data, length, &actual_length, TIMEOUT);
if (rc == LIBUSB_ERROR_TIMEOUT)
{
printf("timeout (%d)\n", actual_length);
return -1;
}
else if (rc < 0)
{
fprintf(stderr, "Error while waiting for char: %d\n", rc);
return -1;
}
return actual_length;
}
int main(int argc, char **argv)
{
int rc;
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
devh = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!devh)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* We can now start sending or receiving data to the device */
unsigned char buf[BYTES];
unsigned char rbuf[EP_SIZE];
int len;
int n;
int l;
int res;
// fill buffer
for (n = 0; n < BYTES; n++)
{
buf[n] = 0x00+n;
}
// loopback data, write-read
for (l = 0; l < BYTES/EP_SIZE; l++)
{
len = write_chars(buf+l*EP_SIZE, EP_SIZE);
len = read_chars(rbuf, EP_SIZE);
res = memcmp(rbuf, buf+l*EP_SIZE, sizeof(rbuf));
if (res != 0)
fprintf(stderr, "Miscompare: block %d\n", l);
}
libusb_release_interface(devh, 0);
out:
if (devh)
{
libusb_close(devh);
}
libusb_exit(NULL);
return rc;
}
Using Asynchronous and Synchronous together (i.e. OUT is submitted Asynchronously and IN is Synchronous)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define BYTES EP_SIZE*TRANSFERS
#define TIMEOUT 15*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variable */
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char wbuf[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
count_out++;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
unsigned char *buf, *rbuf_tmp;
size_t length = 64;
int n;
int i;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
buf = malloc(length*TRANSFERS);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE*TRANSFERS; n++)
{
buf[n] = n;
}
/* allocate memory */
rbuf_tmp = malloc(length);
/* set up alternating OUT-IN transfers */
for (i = 0; i < TRANSFERS; i++)
{
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, buf+i, EP_SIZE, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
int actual_length;
int rc = libusb_bulk_transfer(handle, ep_in, rbuf_tmp, EP_SIZE, &actual_length, TIMEOUT);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
memcpy(rbuf+i*EP_SIZE, rbuf_tmp, EP_SIZE);
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
The above code was an experiment to see if performance increased. Interestingly, the speed difference between the two was negligible.
The version of libusb was 1.0.17.10830
I am working on Assignment 2 of ops-class.
The following function bootstraps a file handler array for the process that is being created (eg. user processes for test programs provided here).
int _fh_bootstrap(struct fharray *fhs){
/* Initialize the file handle array of this process */
fharray_init(fhs);
/* String variables initialized for passage to vfs_open */
char* console_inp = kstrdup(CONSOLE); // CONSOLE = "con:"
char* console_out = kstrdup(console_inp);
char* console_err = kstrdup(console_inp);
/* Return variable */
int ret = 0;
/* Initialize the console files STDIN, STDOUT and STDERR */
struct vnode *stdin;
ret = vfs_open(console_inp,O_RDONLY,0,&stdin);
if(ret != 0){
return ret;
}
kfree(console_inp);
struct fh *stdinfh = kmalloc(sizeof(struct fh));
ret = _fh_create(O_RDONLY,stdin,stdinfh);
if(ret != 0){
return ret;
}
stdinfh->fd = STDIN_FILENO;
fharray_add(fhs,stdinfh,NULL);
struct vnode *stdout;
ret = vfs_open(console_out,O_WRONLY,0,&stdout);
if(ret != 0){
return ret;
}
kfree(console_out);
struct fh *stdoutfh = kmalloc(sizeof(struct fh));
ret = _fh_create(O_WRONLY,stdout,stdoutfh);
if(ret != 0){
return ret;
}
stdoutfh->fd = STDOUT_FILENO;
fharray_add(fhs,stdoutfh,NULL);
struct vnode *stderr;
ret = vfs_open(console_err,O_WRONLY,0,&stderr);
if(ret != 0){
return ret;
}
kfree(console_err);
struct fh *stderrfh = kmalloc(sizeof(struct fh));
ret = _fh_create(O_WRONLY,stderr,stderrfh);
if(ret != 0){
return ret;
}
stderrfh->fd = STDERR_FILENO;
fharray_add(fhs,stderrfh,NULL);
fharray_setsize(fhs,MAX_FD);
return 0;
/* Initialization of stdin, out and err filehandlers complete */
}
If I use os161-gdb to step through this function, I notice the following:
//*stdinfh after the call to _fh_create
{fd = 0, flag = 0, fh_seek = 0, fh_vnode = 0x80044ddc}
//**stdinfh->fh_vnode
{vn_refcount = 2, vn_countlock = {splk_lock = 0, splk_holder = 0x0}, vn_fs = 0x0,
vn_data = 0x8004ab60, vn_ops = 0x8003e690 <dev_vnode_ops>}
This is the strange part. After stepping through the second call to kmalloc (to init stdoutfh), the stdinfh->fh_vnode pointer changes value!
//**stdinfh->fh_vnode
(struct vnode *) 0x1
And even stranger, after proceeding to the following line
fharray_add(fhs,stdoutfh,NULL);
The value of *stdoutfh->fh_vnode and *stdinfh->fh_vnode IS THE SAME
1 possible explanation: Does the OS not have enough heap memory. I find it unlikely and even after assuming this, I can't exactly explain what is happening here.
Some extra code
_fh_create
struct fh definition
static int _fh_create(int flag, struct vnode *file, struct fh *handle){
KASSERT(file != NULL);
/* W , R , RW */
if (
((flag & O_RDONLY) && (flag & O_WRONLY)) ||
((flag & O_RDWR) && ((flag & O_RDONLY) || (flag & O_WRONLY)))
) {
handle = NULL;
return 1;
}
handle->flag = flag;
handle->fh_seek = 0;
handle->fh_vnode = &file;
return 0;
}
struct fh {
uint32_t fd; // file descriptor
int flag; // File handler mode
off_t fh_seek; // seek position in file
struct vnode **fh_vnode; // File object of the file
}
Definition of struct vnode can be found here.
Please let me know if you need more info and thanks for the help!
The code handle->fh_vnode is setting a pointer to automatic variable ("on the stack"), function parameters are automatic variables. After the function returns this will be a dangling pointer.
To fix this you will need to re-design your code a bit, e.g. perhaps the struct fh should just store file and not a pointer to file.
I tried issuing a SCSI Read(10) command to a physical drive on a Windows 7 machine. Below is the code snippet that I am using. It is failing with error code 87.
void scsi_read()
{
const UCHAR cdb[10] = { 0x28, 0, 0, 0, 0, 0, 0, 0, 512, 0 };
UCHAR buf[512];
BYTE senseBuf[196];
const int SENSE_LENGTH = 196;
LPCSTR fname = "\\\\.\\E:";
HANDLE fh;
DWORD ioctl_bytes;
DWORD err = 0;
SCSI_PASS_THROUGH s = {0};
memcpy(s.Cdb, cdb, sizeof(cdb));
s.CdbLength = 10;
s.DataIn = SCSI_IOCTL_DATA_IN;
s.TimeOutValue = 30;
s.Length = sizeof(SCSI_PASS_THROUGH);
s.ScsiStatus = 0x00;
s.SenseInfoOffset = senseBuf;
s.SenseInfoLength = SENSE_LENGTH;
s.DataBufferOffset = buf;
s.DataTransferLength = 512;
fh = CreateFile("\\\\.\\E:", GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL);
if(fh == INVALID_HANDLE_VALUE) {
printf("Could not open %s file, error %d\n", fname, GetLastError());
return (FALSE);
}
int ret = DeviceIoControl(fh,IOCTL_SCSI_PASS_THROUGH, &s,sizeof(s), //scsiPassThrough.sizeof,
&s,
sizeof(s),
&ioctl_bytes,
NULL);
printf("ret %d",(int)ret);
if (ret==1) {
printf("OK");
}
else {
err = GetLastError();
printf("Last error code %u\n", err);
printf("Return size %d\n", ioctl_bytes);
printf("Sense data\n");
int i=0;
for (i = 0; i < 20; i++) {
printf("\t%x", senseBuf[i]);
}
printf("\n");
}
CloseHandle(fh);
}
Error: Hex dumps are printed in the output
you got error code 87 - ERROR_INVALID_PARAMETER because code totally wrong.
for example:
const UCHAR cdb[10] = { 0x28, 0, 0, 0, 0, 0, 0, 0, 512, 0 };
but 512 is > 255 (MAXUCHAR) are you not got compiler warning here ?
warning C4305: 'initializing': truncation from 'int' to 'const UCHAR'
look at this line !
s.DataBufferOffset = buf;
from SCSI_PASS_THROUGH structure:
DataBufferOffset
Contains an offset from the beginning of this structure to the data
buffer. The offset must respect the data alignment requirements of the
device.
so offset to buffer, not pointer to buffer
for use this correct you code need be like this:
struct MY_DATA : SCSI_PASS_THROUGH
{
UCHAR buf[512];
} s;
s.DataBufferOffset = FIELD_OFFSET(MY_DATA, buf);
but better use SCSI_PASS_THROUGH_DIRECT with IOCTL_SCSI_PASS_THROUGH_DIRECT
you hardcode sector size (512), when need get it at runtime. and how you initialize CDB ?!? at all unclear what you try todo.
working code example (sorry but on c++ instead c)
#define _NTSCSI_USER_MODE_
#include <scsi.h>
#include <ntddscsi.h>
BOOL scsi_read(HANDLE fh, PVOID buf, DWORD cb, ULONGLONG LogicalBlock, ULONG TransferBlocks)
{
SCSI_PASS_THROUGH_DIRECT s = {
sizeof(SCSI_PASS_THROUGH_DIRECT), 0, 0, 0, 0, 0, 0, SCSI_IOCTL_DATA_IN, cb, 30, buf
};
union {
PUCHAR Cdb;
CDB::_CDB10* Cdb10;
CDB::_CDB16* Cdb16;
};
Cdb = s.Cdb;
if (MAXULONG < LogicalBlock || MAXUSHORT < TransferBlocks)
{
s.CdbLength = sizeof(CDB::_CDB16);
Cdb16->OperationCode = SCSIOP_READ16;
*(ULONGLONG*)Cdb16->LogicalBlock = _byteswap_uint64(LogicalBlock);
*(ULONG*)Cdb16->TransferLength = _byteswap_ulong(TransferBlocks);
}
else
{
s.CdbLength = sizeof(CDB::_CDB10);
Cdb10->OperationCode = SCSIOP_READ;
*(ULONG*)&Cdb10->LogicalBlockByte0 = _byteswap_ulong((ULONG)LogicalBlock);
*(USHORT*)&Cdb10->TransferBlocksMsb = _byteswap_ushort((USHORT)TransferBlocks);
}
DWORD ioctl_bytes;
return DeviceIoControl(fh, IOCTL_SCSI_PASS_THROUGH_DIRECT, &s, sizeof(s), &s, sizeof(s), &ioctl_bytes, NULL);
}
BOOL test_scsi_read(PCWSTR fname)
{
BOOL fOk = FALSE;
HANDLE fh = CreateFileW(fname, GENERIC_READ | GENERIC_WRITE,
FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL);
if (fh != INVALID_HANDLE_VALUE)
{
DWORD ioctl_bytes;
DISK_GEOMETRY_EX dg;
if (DeviceIoControl(fh, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, NULL, 0, &dg, sizeof(dg), &ioctl_bytes, 0))
{
// 16 sectors for example
ULONG cb = 16 * dg.Geometry.BytesPerSector;
if (PVOID buf = new CHAR[cb])
{
// read first 16 sectors
fOk = scsi_read(fh, buf, cb, 0, 16);
if (ULONGLONG LogicalBlock = dg.DiskSize.QuadPart / dg.Geometry.BytesPerSector)
{
// read last sector
fOk = scsi_read(fh, buf, dg.Geometry.BytesPerSector, LogicalBlock - 1, 1);
}
delete buf;
}
}
CloseHandle(fh);
}
return fOk;
}
test_scsi_read(L"\\\\?\\e:");
I'm pretty a novice about opencl. I have tried about "get the summation of all cubes of every element in an array". Here's my kernel code:
kernel void cubeSum(global float *input,
local float *prods,
global float *output )
{
int gid = get_global_id( 0 );
int tnum = get_local_id( 0 ); // thread number
int wgNum = get_group_id( 0 ); // work-group number
int numItems = get_local_size( 0 );
prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube
for (int offset = 1; offset < numItems; offset *= 2) {
int mask = 2 * offset - 1;
barrier(CLK_LOCAL_MEM_FENCE);
if ( (tnum & mask) == 0 ) {
prods[tnum] += prods[tnum + offset];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if ( tnum == 0 )
output[wgNum] = prods[0];
}
I can't figure out why my result is not the same with sequential result. When the array is from 0 to 511, my result is sequential result minus 2048; when the array is from 0 to 1023, my result is sequential result plus 16384.
I will try to figure it out myself while I'm waiting for you answers.
Another question is I found it is hard to debug kernel code since the dataset is quite big and it runs concurrently. Any advice for debugging?
All the advices are appreciated =).
By the way, here's my host code:
#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>
#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, const char * argv[])
{
float data[NUM_ELEMENTS]; //hA
float sum;
float sumTest;
size_t global;
size_t local;
size_t numWorkGroups;
size_t dataSize;
size_t resultsSize;
cl_device_id device;
cl_context context;
cl_command_queue cmdQueue;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem output;
FILE *fp;
//failed to use relative path here. permission problem?
char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
char *source_str;
size_t source_size;
/* カーネルを含むソースコードをロード */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
//allocate the host memory buffers:
int i = 0;
unsigned int count = NUM_ELEMENTS;
for (i = 0; i < count; i++) {
data[i] = i;
}
//array size in bytes (will need this later):
dataSize = NUM_ELEMENTS * sizeof(float);
//opencl function status
cl_int status;
// Connect to a compute device
//
int gpu = 1;
status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
//create an Opencl context
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
//create a command queue
cmdQueue = clCreateCommandQueue( context, device, 0, &status );
//allocate memory buffers on the device
input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status ); //dA
//TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
//need to figure out a way to avoid hardcode it.
output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC
// enqueue the 2 commands to write data into the device buffers:
status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );
// create the kernel program on the device:
program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (status != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
//create compute kernel
kernel = clCreateKernel( program, "cubeSum", &status );
// Get the maximum work group size for executing the kernel on the device
//
status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", status);
exit(1);
}
global = count;
numWorkGroups = global / local;
float results[numWorkGroups]; //hC
resultsSize = numWorkGroups * sizeof(float);
//set kernel parameter
status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (status)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
clFinish(cmdQueue);
status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );
// Validate our results
//
sum = 0;
for (int i=0; i<numWorkGroups; i++) {
sum += results[i];
}
sumTest = 0;
for(i = 0; i < count; i++)
{
sumTest += data[i] * data[i] * data[i];
}
// Print a brief summary detailing the results
//
printf("Computed '%f/%f'!\n", sum, sumTest);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);
return 0;
}
EDIT: Just found another thing. My code is correct if I just sum all element without cube/square. Thus, I'm gonna figure out how cube affect to my program.
You appear to only be allocating 4-bytes of local memory:
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
This should be the total amount of local memory required for that argument by the entire work-group. In the case of your kernel, this is (work-group-size * sizeof(float)).
So, you should instead have something like this:
status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );
The discrepancies you are seeing are likely coming from the limitations of floating point, since you are summing some very large numbers. If you initialise your inputs with smaller numbers (e.g. data[i] = i*0.01;), you should get results equal to your sequential implementation (I've verified this on my own system). This is why you don't see the errors when you remove the cube.