I am working on a project that locates a sound source. I am using three microphones to capture data and then cross-correlating the signals to find the time difference between them. The time difference is then used to locate the sound source. I think the issue I am having is in the cross-correlation part of my code. I am doing this in C, and from my understanding an FFT is needed, so I am using the FFTW library for it. The result of my cross correlation is always 0. The microphones are sampled continuously; after a loud sound the program captures data for a further half buffer and then writes the data to a text file. I am using a circular buffer for storing the data. After the trigger event occurs, the data is reordered and then fast Fourier transformed. The spectra are then multiplied and the product is inverse transformed. I then determine where the peak occurs in the output of the IFFT. Most of the time it is 0, but I am not sure why. So I wrote the data to a text file and imported it into MATLAB, to test the xcorr function with my data. The results of this match my cross-correlation code in C. So I am now not sure where the issue lies in my code.
I have added a link to the data I acquired in a text file [1]. For my cross correlation in C I have the following code.
#include <complex.h> /* Standard Library of Complex Numbers */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fftw3.h>
int i,size = 8; //defining the size of the buffer and FFT
int peak_detec(double);
void reverse(double a[]);
/*
 * Returns the position of the largest value seen during the scan.
 *
 * NOTE(review): the parameter is a single double, not an array, so every
 * iteration compares the value with itself and the comparison never
 * succeeds. The function therefore always reports position 0. It probably
 * was meant to take a `double a[]` plus a length; the signature is left
 * untouched here because the prototype above the definition declares it
 * with a scalar argument.
 */
int peak_detec(double a)
{
    double largest = a;
    int position = 0; /* previously uninitialised: returning it was undefined behaviour */

    for (int b = 1; b < 10; b++) {
        if (largest < a) {
            largest = a;
            position = b;
        }
    }
    return position;
}
/*
 * Reverses the first `size` elements of `a` in place, where `size` is the
 * file-level buffer length. The caller must supply at least `size` elements.
 */
void reverse(double a[])
{
    int lo = 0;
    int hi = size - 1;

    /* Walk inwards from both ends, exchanging the pair at each step. */
    while (lo < hi) {
        double held = a[lo];
        a[lo] = a[hi];
        a[hi] = held;
        ++lo;
        --hi;
    }
}
/*
 * Cross-correlates two fixed test signals with FFTW and prints the index of
 * the largest correlation value.
 *
 * Steps: zero-pad both signals to 2*size samples, forward-transform them,
 * multiply the first spectrum by the complex conjugate of the second,
 * inverse-transform the product and scan the result for its maximum.
 *
 * The printed index is a circular lag: an index p < size is a lag of p
 * samples, an index p > size stands for the negative lag p - 2*size.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if an allocation or a plan fails.
 */
int main(void)
{
    const double array[]  = {0.7,0.1,0.1,0.1,0.1,0.1,0.1,0.1}; /* the two signals to correlate */
    const double array2[] = {0.1,0.1,0.1,0.1,0.1,0.1,0.7,0.1};

    const int n = 2 * size;       /* zero-padded transform length */
    const int n_bins = n / 2 + 1; /* complex bins produced by a real-input transform of length n */
    int status = EXIT_FAILURE;

    double *double_array = NULL;
    double *double_array2 = NULL;
    double *out3 = NULL;
    double complex *out_cpx = NULL;
    double complex *out_cpx2 = NULL;
    double complex *out_cpx3 = NULL;
    fftw_plan fft = NULL;
    fftw_plan fft2 = NULL;
    fftw_plan ifft3 = NULL;

    /**** allocating memory ****/
    double_array  = fftw_malloc(sizeof *double_array * n);
    double_array2 = fftw_malloc(sizeof *double_array2 * n);
    out3          = fftw_malloc(sizeof *out3 * n);
    out_cpx       = fftw_malloc(sizeof *out_cpx * n_bins);
    out_cpx2      = fftw_malloc(sizeof *out_cpx2 * n_bins);
    out_cpx3      = fftw_malloc(sizeof *out_cpx3 * n_bins);
    if (!double_array || !double_array2 || !out3 ||
        !out_cpx || !out_cpx2 || !out_cpx3) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    /**** setting up fft and ifft plans ****/
    fft   = fftw_plan_dft_r2c_1d(n, double_array, out_cpx, FFTW_ESTIMATE);
    fft2  = fftw_plan_dft_r2c_1d(n, double_array2, out_cpx2, FFTW_ESTIMATE);
    ifft3 = fftw_plan_dft_c2r_1d(n, out_cpx3, out3, FFTW_ESTIMATE);
    if (!fft || !fft2 || !ifft3) {
        fprintf(stderr, "could not create FFTW plans\n");
        goto cleanup;
    }

    /*
     * Zero the whole padded buffer before copying the samples in. The
     * earlier version started its memset at offset size+1 and so left
     * element `size` holding whatever malloc returned.
     */
    memset(double_array, 0, sizeof *double_array * n);
    memcpy(double_array, array, sizeof(double) * size);
    memset(double_array2, 0, sizeof *double_array2 * n);
    memcpy(double_array2, array2, sizeof(double) * size);

    /**** executing fft ****/
    fftw_execute(fft);
    fftw_execute(fft2);

    /*
     * Correlation needs one spectrum conjugated; a plain product would be a
     * convolution. Only the n/2+1 bins written by the real-input transform
     * are touched. FFTW transforms are unnormalised, hence the 1/n factor.
     */
    const double scale = 1.0 / n;
    for (int k = 0; k < n_bins; k++) {
        out_cpx3[k] = out_cpx[k] * conj(out_cpx2[k]) * scale;
    }

    /**** executing ifft ****/
    fftw_execute(ifft3);

    /**** finding the largest value of the correlation ****/
    int Position = 0;
    double Largest = out3[0];
    for (int k = 1; k < n; k++) {
        if (Largest <= out3[k]) {
            Largest = out3[k];
            Position = k;
        }
    }
    printf("Position of peak value: %d\n",Position);
    status = EXIT_SUCCESS;

cleanup:
    /**** frees the plans and the memory ****/
    if (fft) {
        fftw_destroy_plan(fft);
    }
    if (fft2) {
        fftw_destroy_plan(fft2);
    }
    if (ifft3) {
        fftw_destroy_plan(ifft3);
    }
    if (out_cpx) {
        fftw_free(out_cpx);
    }
    if (out_cpx2) {
        fftw_free(out_cpx2);
    }
    if (out_cpx3) {
        fftw_free(out_cpx3);
    }
    if (double_array) {
        fftw_free(double_array);
    }
    if (double_array2) {
        fftw_free(double_array2);
    }
    if (out3) {
        fftw_free(out3);
    }
    return status;
}
Can anyone help with my issue? I am trying to do cross correlation and with the data in the shared link but does not seem to work.
It is my first time asking a question on here, I think I have given enough information but if not then I can try provide more if something does not make sense.
Related
I am having issues returning a 2D array from a C extension back to Python. When I allocate memory using malloc the returned data is rubbish. When I just initialise an array like sol_matrix[nt][nvar] the returned data is as expected.
#include <Python.h>
#include <numpy/arrayobject.h>
#include <math.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
// function to be solved by Euler solver
double func (double xt, double y){
double y_temp = pow(xt, 2);
y = y_temp;
return y;
}
static PyObject* C_Euler(double h, double xn)
{
double y_temp, dydx; //temps required for solver
double y_sav = 0; //temp required for solver
double xt = 0; //starting value for xt
int nvar = 2; //number of variables (including time)
int nt = xn/h; //timesteps
double y = 0; //y starting value
//double sol_matrix[nt][nvar]; //works fine
double **sol_matrix = malloc(nt * sizeof(double*)); //doesn't work
for (int i=0; i<nt; ++i){
sol_matrix[i] = malloc (nvar * sizeof(double));
}
int i=0;
//solution loop - Euler method.
while (i < nt){
sol_matrix[i][0]=xt;
sol_matrix[i][1]=y_sav;
dydx = func(xt, y);
y_temp = y_sav + h*dydx;
xt = xt+h;
y_sav=y_temp;
i=i+1;
}
npy_intp dims[2];
dims[0] = nt;
dims[1] = 2;
//Create Python object to copy solution array into, get pointer to
//beginning of array, memcpy the data from the C colution matrix
//to the Python object.
PyObject *newarray = PyArray_SimpleNew(2, dims, NPY_DOUBLE);
double *p = (double *) PyArray_DATA(newarray);
memcpy(p, sol_matrix, sizeof(double)*(nt*nvar));
// return array to Python
return newarray;
}
// Python entry point: parses two floats (h, xn) and returns the array built
// by C_Euler. Returns NULL when the arguments cannot be parsed.
static PyObject* Euler(PyObject* self, PyObject* args)
{
    double h, xn;
    if (!PyArg_ParseTuple(args, "dd", &h, &xn)){
        return NULL;
    }
    // C_Euler hands back the object created by PyArray_SimpleNew, which the
    // caller already owns. Wrapping it in Py_BuildValue("O", ...) added a
    // second reference that was never released, so pass it through directly.
    return C_Euler(h, xn);
}
Could you provide any guidance on where I am going wrong?
Thank you.
The data in sol_matrix is not in contiguous memory, it's in nt separately allocated arrays. Therefore the line
memcpy(p, sol_matrix, sizeof(double)*(nt*nvar));
is not going to work.
I'm not a big fan of pointer-to-pointer arrays so believe your best option is to allocate sol_matrix as one big chunk:
double *sol_matrix = malloc(nt*nvar * sizeof(double));
This does mean you can't do 2D indexing so will need to do
// OLD: sol_matrix[i][0]=xt;
sol_matrix[i*nvar + 0] = xt;
In contrast
double sol_matrix[nt][nvar]; //works fine
is a single big chunk of memory so the copy works fine.
I am trying to create a simple program that uses Intel's AVX technology and perform vector multiplication and addition. Here I am using Open MP alongside this. But it is getting segmentation fault due to the function call _mm256_store_ps().
I have tried with OpenMP atomic features like atomic, critical, etc so that if this function is atomic in nature and multiple cores are attempting to execute at the same time, but it is not working.
#include<stdio.h>
#include<time.h>
#include<stdlib.h>
#include<immintrin.h>
#include<omp.h>
#define N 64
/* Lane-wise a*b + c on packed single-precision vectors. */
__m256 multiply_and_add_intel(__m256 a, __m256 b, __m256 c) {
    const __m256 product = _mm256_mul_ps(a, b);
    return _mm256_add_ps(product, c);
}
void multiply_and_add_intel_total_omp(const float* a, const float* b, const float* c, float* d)
{
__m256 a_intel, b_intel, c_intel, d_intel;
#pragma omp parallel for private(a_intel,b_intel,c_intel,d_intel)
for(long i=0; i<N; i=i+8) {
a_intel = _mm256_loadu_ps(&a[i]);
b_intel = _mm256_loadu_ps(&b[i]);
c_intel = _mm256_loadu_ps(&c[i]);
d_intel = multiply_and_add_intel(a_intel, b_intel, c_intel);
_mm256_store_ps(&d[i],d_intel);
}
}
/*
 * Fills three N-element arrays with random digits, runs the AVX + OpenMP
 * multiply-add over them once and prints the elapsed wall-clock time.
 *
 * Returns 0 on success, 1 if a buffer cannot be allocated.
 */
int main()
{
    /* 256-bit AVX vectors want 32-byte alignment for aligned loads/stores. */
    enum { AVX_ALIGNMENT = 32 };
    /* With N a multiple of 8, bytes is a multiple of AVX_ALIGNMENT, which
       C11 aligned_alloc requires of its size argument. */
    const size_t bytes = sizeof(float) * N;

    srand((unsigned)time(NULL));

    float *a = aligned_alloc(AVX_ALIGNMENT, bytes);
    float *b = aligned_alloc(AVX_ALIGNMENT, bytes);
    float *c = aligned_alloc(AVX_ALIGNMENT, bytes);
    float *d_intel_avx_omp = aligned_alloc(AVX_ALIGNMENT, bytes);
    if (!a || !b || !c || !d_intel_avx_omp) {
        fprintf(stderr, "allocation failed\n");
        free(a);
        free(b);
        free(c);
        free(d_intel_avx_omp);
        return 1;
    }

    for (int i = 0; i < N; i++) {
        a[i] = (float)(rand()%10);
        b[i] = (float)(rand()%10);
        c[i] = (float)(rand()%10);
    }

    /* Named `elapsed` rather than `time_t`, which hid the <time.h> type. */
    double elapsed = omp_get_wtime();
    multiply_and_add_intel_total_omp(a,b,c,d_intel_avx_omp);
    elapsed = omp_get_wtime() - elapsed;
    printf("\nTime taken to calculate with AVX2 and OMP : %0.5lf\n",elapsed);

    /* A stray closing brace used to end main before these frees. */
    free(a);
    free(b);
    free(c);
    free(d_intel_avx_omp);
    return 0;
}
I expect that I will get d = a * b + c but it is showing segmentation fault. I have tried to perform the same task without OpenMP and it working errorless. Please let me know if there is any compatibility issue or I am missing any part.
gcc version 7.3.0
Intel® Core™ i3-3110M Processor
OS Ubuntu 18.04
Open MP 4.5, I have executed the command $ echo |cpp -fopenmp -dM |grep -i open and it showed #define _OPENMP 201511
Command to compile, gcc first_int.c -mavx -fopenmp
** UPDATE **
As per the discussions and suggestions, the new code is,
/* The first argument of aligned_alloc is the alignment in bytes, not the
   element count. 32 is what 256-bit AVX loads/stores need; passing N only
   worked while N happened to be 64 and is not a valid alignment for a
   value such as N = 50000. */
float * a = (float *) aligned_alloc(32, sizeof(float) * N);
float * b = (float *) aligned_alloc(32, sizeof(float) * N);
float * c = (float *) aligned_alloc(32, sizeof(float) * N);
float * d_intel_avx_omp = (float *)aligned_alloc(32, sizeof(float) * N);
This is now working perfectly.
Just a note, I was trying to compare general calculations, avx calculation and avx+openmp calculation. This is the result I got,
Time taken to calculate without AVX : 0.00037
Time taken to calculate with AVX : 0.00024
Time taken to calculate with AVX and OMP : 0.00019
N = 50000
The documentation for _mm256_store_ps says:
Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
You can use _mm256_storeu_ps instead for unaligned stores.
A better option is to align all your arrays on a 32-byte boundary (for 256-bit avx registers) and use aligned load and stores for maximum performance because unaligned loads/stores crossing a cache line boundary incur performance penalty.
Use std::aligned_alloc (or C11 aligned_alloc, memalign, posix_memalign, whatever you have available) instead of malloc(size), e.g.:
// Allocates storage for at least n floats, aligned for __m256.
// Returns nullptr on failure; release the result with free().
float* allocate_aligned(size_t n) {
    constexpr size_t alignment = alignof(__m256);
    // aligned_alloc expects the size to be a whole multiple of the
    // alignment, so round the byte count up to the next multiple.
    const size_t bytes = (sizeof(float) * n + alignment - 1) / alignment * alignment;
    return static_cast<float*>(aligned_alloc(alignment, bytes));
}
// ...
float* a = allocate_aligned(N);
float* b = allocate_aligned(N);
float* c = allocate_aligned(N);
float* d_intel_avx_omp = allocate_aligned(N);
In C++-17 new can allocate with alignment:
// Allocates n floats with the alignment of __m256 via the C++17 aligned
// form of new[].
float* allocate_aligned(size_t n) {
    return new(std::align_val_t{alignof(__m256)}) float[n];
}
Alternatively, use Vc: portable, zero-overhead C++ types for explicitly data-parallel programming that aligns heap-allocated SIMD vectors for you:
#include <cstdio>
#include <memory>
#include <chrono>
#include <Vc/Vc>
// Builds one Vc::float_v whose lanes are std::rand() % 10, drawn in lane
// order through an aligned staging array.
Vc::float_v random_float_v() {
    alignas(Vc::VectorAlignment) float lanes[Vc::float_v::Size];
    for(float& lane : lanes) {
        lane = std::rand() % 10;
    }
    return Vc::float_v(lanes, Vc::Aligned);
}
// Folds the unsigned words in [vbegin, vend) into a CRC using the
// __builtin_ia32_crc32si builtin, visiting the words from the last one back
// to the first. Returns 0 for an empty range.
unsigned reverse_crc32(void const* vbegin, void const* vend) {
    auto const* const first = static_cast<unsigned const*>(vbegin);
    auto const* cursor = static_cast<unsigned const*>(vend);
    unsigned crc = 0;
    for(; cursor != first; ) {
        --cursor;
        crc = __builtin_ia32_crc32si(crc, *cursor);
    }
    return crc;
}
int main() {
constexpr size_t N = 65536;
constexpr size_t M = N / Vc::float_v::Size;
std::unique_ptr<Vc::float_v[]> a(new Vc::float_v[M]);
std::unique_ptr<Vc::float_v[]> b(new Vc::float_v[M]);
std::unique_ptr<Vc::float_v[]> c(new Vc::float_v[M]);
std::unique_ptr<Vc::float_v[]> d_intel_avx_omp(new Vc::float_v[M]);
for(unsigned i = 0; i < M; ++i) {
a[i] = random_float_v();
b[i] = random_float_v();
c[i] = random_float_v();
}
auto t0 = std::chrono::high_resolution_clock::now();
for(unsigned i = 0; i < M; ++i)
d_intel_avx_omp[i] = a[i] * b[i] + c[i];
auto t1 = std::chrono::high_resolution_clock::now();
double seconds = std::chrono::duration_cast<std::chrono::duration<double>>(t1 - t0).count();
unsigned crc = reverse_crc32(d_intel_avx_omp.get(), d_intel_avx_omp.get() + M); // Make sure d_intel_avx_omp isn't optimized out.
std::printf("crc: %u, time: %.09f seconds\n", crc, seconds);
}
Parallel version:
#include <tbb/parallel_for.h>
// ...
// Same timed computation as the serial loop, with the iterations handed to
// tbb::parallel_for. The lambda captures the arrays by reference and is
// called once per index i, writing only d_intel_avx_omp[i].
auto t0 = std::chrono::high_resolution_clock::now();
tbb::parallel_for(size_t{0}, M, [&](unsigned i) {
d_intel_avx_omp[i] = a[i] * b[i] + c[i];
});
auto t1 = std::chrono::high_resolution_clock::now();
You must use aligned memory for these intrinsics. Change your malloc(...) to aligned_alloc(sizeof(float) * 8, ...) (C11).
This is completely unrelated to atomics. You are working on entirely separate pieces of data (even on different cache lines), so there is no need for any protection.
I am new to using cuda and the magma libraries. I'm trying out some functions on a test problem, a 2D heat equation. The code I wrote seemed to work perfectly for grid sizes of 32, 64, and 128. But it produced wrong results for grid sizes of 256 or larger. I am only posting part of the code here, just enough to reproduce the error. Transferring the final matrix and looking at it in matlab shows that the second call to magmablas_dgemm introduced errors into the solution.
Is there anyone out there who can see why this code would break for larger grid sizes?
int main(int argc, char* argv[])
{
// Get parameters for problem set up
int side_width = atoi(argv[1]); //assuming square grid, N/32 integer
double dx = 2.0 / (side_width-1);
double dt = 0.25 * dx;
//double Tend = dt*3;// 0.5;
// create memory pointers for derivative operator matrices and solution matrix
double* U;
double* Dleft;
double* Dright;
double* dev_U;
double* dev_Dleft;
double* dev_Dright;
//initialize the MAGMA system
magma_init();
magma_int_t N = side_width;
// temp variables required by MAGMA functions
magma_int_t *piv, info, err;
piv = (magma_int_t*)malloc(N*sizeof(magma_int_t));
// Allocate memory for matrices on host and device
err = magma_dmalloc_cpu(&U, N*N);
err += magma_dmalloc_cpu(&Dleft, N*N);
err += magma_dmalloc_cpu(&Dright, N*N);
err += magma_dmalloc(&dev_U, N*N);
err += magma_dmalloc(&dev_Dleft, N*N);
err += magma_dmalloc(&dev_Dright, N*N);
if (err){
printf("error in allocation. err number = %d\n", err);
exit(1);
}
// zero out matrices (not efficient but correct)
for (int k=0; k<N*N; ++k ){
U[k] = 1.0;
Dleft[k] = 0.0;
Dright[k] = 0.0;
}
//create derivative operator matrices
double a = dt/2.0/dx/dx;
double b = dt/dx/dx;
Dleft[0] = 1.0;
Dleft[N*N-1] = 1.0;
for (int k=1; k<N-1; ++k) {
Dleft[k*N + k-1] = -a;
Dleft[k*N + k] = 1+b;
Dleft[k*N + k+1] = -a;
Dright[k*N + k-1] = a;
Dright[k*N + k] = 1-b;
Dright[k*N + k+1] = a;
}
// Determine block and thread amounts
int grid_dim = ((side_width + 31)/32) ;
int block_dim = 32;
dim3 gridDim(grid_dim, grid_dim);
dim3 blockDim(block_dim, block_dim);
//copy data from host to device
magma_dsetmatrix(N, N, U, N, dev_U, N);
magma_dsetmatrix(N, N, Dleft, N, dev_Dleft, N);
magma_dsetmatrix(N, N, Dright, N, dev_Dright, N);
// LU factorize the left hand operator matrix
magma_dgetrf_gpu(N, N, dev_Dleft, N, piv, &info);
double tn = 0; //time counter
// needed to take first step outside while loop because of some tricky transpose nonsense happening
tn += dt;
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans,MagmaNoTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
// implicit step solve : Dleft*U=Uhat
magma_dgetrs_gpu(MagmaTrans, N, N, dev_Dleft, N, piv, dev_U, N, &info);
// compute explicit step : Uhat=Dright*U^T
magmablas_dgemm(MagmaTrans, MagmaTrans, N, N, N, 1.0f, dev_Dright, N, dev_U, N, 0.0f, dev_U, N);
printf("GPU matrix U at time %3.3f \n ", tn);
magma_dprint_gpu(16, 16, dev_U, N);
//copy solution from device to host
magma_dgetmatrix(N, N, dev_U, N, U, N);
//write data to file
char filename[256];
char str_t[128];
sprintf(str_t, "%d", N );
sprintf(filename, "ADI_%s.bin", str_t);
FILE* fileID = fopen(filename, "wb");
for (int i=0; i<N*N; ++i){
fwrite(&U[i],sizeof(double),1,fileID);
}
fclose(fileID);
free(U);
free(Dleft);
free(Dright);
magma_free(dev_U);
magma_free(dev_Dleft);
magma_free(dev_Dright);
free(piv);
magma_finalize();
return 0;
}
To the best of my knowledge, BLAS/LAPACK gemm has never supported in-place operations, ie.
C := alpha*op( A )*op( B ) + beta*C
cannot be transformed into
A := alpha*op( A )*op( B ) + beta*A
or
B := alpha*op( A )*op( B ) + beta*B
with any guarantee of correctness, even for the canonical case with alpha = 1, beta = 0. If you can follow the fortran, I would recommend having a look at the reference code from the Dongarra group. That implementation will break if the pointer for the matrix passed as C aliaises either A or B.
In multi-threaded or massively parallel BLAS implementations, this is particularly true. Most parallel execution environments don't support any sort of strong or fixed execution ordering. That can mean that operations which unintentionally work in serial versions of linear algebra routines break in parallel, because of the lack of execution order guarantee. If a routine in a parallel BLAS or LAPACK implementation doesn't explicitly say it supports in-place operations, don't assume otherwise, because there be dragons and all of that...
Your MAGMA gemm calls only work at small sizes by accident, and probably because very small matrix sizes don't expose enough parallelism to hit the correctness problems that will arise from aliasing an input and output pointer. If you change your code so that the inputs and output are different memory allocations, I suspect the problem will disappear.
I have the following code in cuda_computation.cu
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
void checkCUDAError(const char *msg);
// Each thread writes one Euclidean distance: point i is selected by the
// block index, point j by the thread index, and the result is stored at
// f[i * blockDim.x + j].
__global__ void euclid_kernel(float *x, float* y, float* f)
{
    const int i = blockIdx.x;
    const int j = threadIdx.x;
    const int out = blockIdx.x*blockDim.x + threadIdx.x;

    const float dx = x[i] - x[j];
    const float dy = y[i] - y[j];
    f[out] = sqrt(dx*dx + dy*dy);
}
// Benchmark driver: builds n points on the host, then 10000 times uploads
// them, launches euclid_kernel with n blocks of n threads and downloads the
// n*n distance table.
//
// Returns 0 on success, 1 if a host buffer cannot be allocated; CUDA errors
// terminate the program through checkCUDAError.
int main()
{
    float *xh;
    float *yh;
    float *fh;
    float *xd;
    float *yd;
    float *fd;

    size_t n = 256;
    size_t numBlocks = n;
    size_t numThreadsPerBlock = n;
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(float);

    xh = (float *) malloc(n * sizeof(float));
    yh = (float *) malloc(n * sizeof(float));
    fh = (float *) malloc(memSize);
    if (xh == NULL || yh == NULL || fh == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(xh);
        free(yh);
        free(fh);
        return 1;
    }

    for(size_t ii = 0; ii != n; ++ii)
    {
        xh[ii] = static_cast<float>(ii);
        yh[ii] = static_cast<float>(ii);
    }

    // Device allocations are checked the same way as every other CUDA call here.
    cudaMalloc( (void **) &xd, n * sizeof(float) );
    checkCUDAError("cudaMalloc");
    cudaMalloc( (void **) &yd, n * sizeof(float) );
    checkCUDAError("cudaMalloc");
    cudaMalloc( (void **) &fd, memSize );
    checkCUDAError("cudaMalloc");

    // The launch configuration does not change between runs.
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);

    for(int run = 0; run != 10000; ++run)
    {
        //change value to avoid optimizations
        xh[0] = ((float)run)/10000.0;

        cudaMemcpy( xd, xh, n * sizeof(float), cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy");
        cudaMemcpy( yd, yh, n * sizeof(float), cudaMemcpyHostToDevice );
        checkCUDAError("cudaMemcpy");

        euclid_kernel<<< dimGrid, dimBlock >>>( xd, yd, fd );
        // NOTE(review): newer CUDA toolkits deprecate cudaThreadSynchronize in
        // favour of cudaDeviceSynchronize; kept as-is for the toolkit in use.
        cudaThreadSynchronize();
        checkCUDAError("kernel execution");

        cudaMemcpy( fh, fd, memSize, cudaMemcpyDeviceToHost );
        checkCUDAError("cudaMemcpy");
    }

    cudaFree(xd);
    cudaFree(yd);
    cudaFree(fd);
    free(xh);
    free(yh);
    free(fh);
    return 0;
}
// Reads the most recent CUDA runtime error; if there is one, prints it to
// stderr prefixed with msg and terminates the program with exit(-1).
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
It takes about 6" to run on an FX QUADRO 380, while the corresponding serial version using just one i7-870 core takes just about 3". Do I miss something? Is the code under optimised in some ways? Or is it just expected behaviour that for simple calculations (like this all-pairs Euclidean distance) the overhead needed to move memory exceeds the computational gain?
I think you are being killed by the time to move the data.
Especially since you are calling the CUDA kernel with individual values, it might be quicker to upload a large set of values as a 1D array and operate on them.
Also sqrt isn't done in HW on Cuda (at least not on my GPU) whereas the CPU has optimized FPU HW for this and is probably 10x faster than the GPU, and for a small job like this is probably keeping all the results in cache between the timing runs.
Reduce your global memory reads since they are expensive.
You have 4 global memory reads per thread which can be reduced to 2 using shared memory.
// Variant that stages the block's own point in shared memory: thread 0
// loads it once, the block synchronises, and every thread then computes the
// distance from that point to the point selected by its thread index.
__global__ void euclid_kernel(const float * inX_g, const float* inY_g, float * outF_g)
{
    __shared__ float rowX_s;
    __shared__ float rowY_s;

    const unsigned int lane = threadIdx.x;
    if(lane == 0)
    {
        rowX_s = inX_g[blockIdx.x];
        rowY_s = inY_g[blockIdx.x];
    }
    // All threads wait here until thread 0 has filled the shared values.
    __syncthreads();

    const float dx = rowX_s - inX_g[lane];
    const float dy = rowY_s - inY_g[lane];
    const unsigned int outIndex = blockIdx.x * blockDim.x + threadIdx.x;
    outF_g[outIndex] = sqrt(dx * dx + dy * dy);
}
You should also test with different block sizes (as long as you have 100% occupancy).
You are splitting the problem so that each block is responsible for a single i vs all 256 j's. This is bad locality, as those 256 j's have to be reloaded for every block, for a total of 2*256*(256 + 1) loads. Instead, split your grid so that each block is responsible for a range of, say, 16 i's and 16 j's, which is still 256 blocks*256 threads. But each block now loads only 2*(16+16) values, for a total of 2*256*32 loads. The idea is, reuse each loaded value as many times as possible. This may not have a huge impact with 256x256, but becomes more and more important as the size scales.
This optimization is used for efficient matrix multiplies, which have a similar locality problem. See http://en.wikipedia.org/wiki/Loop_tiling, or google for "optimized matrix multiply" for more details. And perhaps the matrix multiplication kernel in the NVIDIA SDK gives some details and ideas.
I have a 2D array of data stored in column-major (Fortran-style) format, and I'd like to take the FFT of each row. I would like to avoid transposing the array (it is not square). For example, my array
fftw_complex* data = new fftw_complex[21*256];
contains entries [r0_val0, r1_val0,..., r20_val0, r0_val1,...,r20_val255].
I can use fftw_plan_many_dft to make a plan to solve each of the 21 FFTs in-place in the data array if it is row-major, e.g. [r0_val0, r0_val1,..., r0_val255, r1_val0,...,r20_val255]:
// Plans howmany in-place 1D transforms of length N over row-major data:
// samples of one transform are adjacent and transforms start N apart.
int main() {
    int N = 256;
    int howmany = 21;
    fftw_complex* data = new fftw_complex[N*howmany];

    const int stride = 1;   // step between samples of one transform
    const int dist = N;     // step between the starts of two transforms

    // this plan is OK
    fftw_plan p = fftw_plan_many_dft(1, &N, howmany,
                                     data, NULL, stride, dist,
                                     data, NULL, stride, dist,
                                     FFTW_FORWARD, FFTW_MEASURE);
    // do stuff...
    return 0;
}
According to the documentation (section 4.4.1 of the FFTW manual), the signature for the function is
fftw_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
fftw_complex *in, const int *inembed,
int istride, int idist,
fftw_complex *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags);
and I should be able to use the stride and dist parameters to set the indexing. From what I can understand from the documentation, the entries in the array to be transformed are indexed as in + j*istride + k*idist where j=0..n-1 and k=0..howmany-1. (My arrays are 1D and there are howmany of them). However, the following code results in a seg. fault (edit: the stride length is wrong, see update below):
// Deliberately failing example kept for reference: the stride is wrong.
int main() {
int N = 256;
int howmany = 21;
fftw_complex* data = new fftw_complex[N*howmany];
fftw_plan p;
// this call results in a seg. fault.
// With istride = N and idist = 1 the elements addressed are data[j*N + k],
// which for j up to N-1 reaches far beyond the N*howmany elements allocated.
p = fftw_plan_many_dft(1,&N,howmany,data,NULL,N,1,data,NULL,N,1,FFTW_FORWARD,FFTW_MEASURE);
return 0;
}
Update:
I made an error choosing the stride length. The correct call is (the correct stride length is howmany, not N):
// Plans howmany in-place 1D transforms of length N over column-major data:
// samples of one transform are howmany apart and transforms start one
// element apart.
int main() {
    int N = 256;
    int howmany = 21;
    fftw_complex* data = new fftw_complex[N*howmany];

    const int stride = howmany; // step between samples of one transform
    const int dist = 1;         // step between the starts of two transforms

    // OK
    fftw_plan p = fftw_plan_many_dft(1, &N, howmany,
                                     data, NULL, stride, dist,
                                     data, NULL, stride, dist,
                                     FFTW_FORWARD, FFTW_MEASURE);
    // do stuff
    return 0;
}
The function works as documented. I made an error with the stride length, which should actually be howmany in this case. I have updated the question to reflect this.
I find the documentation for FFTW is somewhat difficult to comprehend without examples (I could just be illiterate...), so I'm posting a more detailed example below, comparing the usual use of fftw_plan_dft_1d with fftw_plan_many_dft. To recap, in the case of howmany arrays with length N that are stored in a contiguous block of memory referenced as in, the array elements j for each transform k are
*(in + j*istride + k*idist)
The following two pieces of code are equivalent. In the first, the conversion from some 2D array is done explicitly, and in the second the fftw_plan_many_dft call is used to do everything in-place.
Explicit Copy
// Explicit-copy variant: gather each of the howmany length-N arrays into a
// contiguous buffer, then run one ordinary 1D plan on that buffer.
int N, howmany;
// ... (N and howmany are set in the elided code)
fftw_complex* data = (fftw_complex*) fftw_malloc(N*howmany*sizeof(fftw_complex));
// ... load data with howmany arrays of length N
int istride, idist;
// ... if data is column-major, set istride=howmany, idist=1
// if data is row-major, set istride=1, idist=N
// Scratch input/output buffers for a single transform; the plan is created
// once and reused for every k.
fftw_complex* in = (fftw_complex*) fftw_malloc(N*sizeof(fftw_complex));
fftw_complex* out = (fftw_complex*) fftw_malloc(N*sizeof(fftw_complex));
fftw_plan p = fftw_plan_dft_1d(N,in,out,FFTW_FORWARD,FFTW_MEASURE);
for (int k = 0; k < howmany; ++k) {
// Gather element j of transform k using the same j*istride + k*idist
// indexing that fftw_plan_many_dft applies.
for (int j = 0; j < N; ++j) {
in[j] = data[j*istride + k*idist];
}
fftw_execute(p);
// do something with out
}
Plan Many
// Plan-many variant: one plan transforms all howmany length-N arrays in
// place, addressing element j of transform k as data[j*istride + k*idist].
int N, howmany;
// ... (N and howmany are set in the elided code)
fftw_complex* data = (fftw_complex*) fftw_malloc(N*howmany*sizeof(fftw_complex));
// ... load data with howmany arrays of length N
int istride, idist;
// ... if data is column-major, set istride=howmany, idist=1
// if data is row-major, set istride=1, idist=N
// Pass the layout variables chosen above. The call used to hard-code
// howmany and 1, which is only right for column-major data and ignored
// istride/idist altogether.
fftw_plan p = fftw_plan_many_dft(1,&N,howmany,data,NULL,istride,idist,data,NULL,istride,idist,FFTW_FORWARD,FFTW_MEASURE);
fftw_execute(p);