Vitis HLS: pragma Array Partition doesn't work in the sub-function when I set this pragma outside - vivado-hls

I want to unroll the loop "find_col" in the inlined function "find_match". Therefore, I applied the ARRAY_PARTITION pragma to the arrays "mp_buffer" and "mc_buffer" directly below their declarations (which are outside of "find_match"), and I pass them into find_match as arguments. However, there is an II (initiation interval) violation because the arrays are not partitioned. I would like to ask how to solve this, thank you!!
The function find_match (the parallel factor is set to 8 now):
inline void find_match (
Feature_Point origin,
int origin_u_bin,
int origin_v_bin,
int origin_col_idx,
Feature_Point m_buffer[7][4][COL_BIN_FEATURE_MAX],
int32_t m_buffer_num[7][4][V_BIN_NUM+1],
Matching_cand m_matching[U_BIN_NUM][4][COL_BIN_FEATURE_MAX]
) {
Matching_cand min_cand;
int16_t min_cost = 32766;
int16_t psum;
int16_t psum_03, psum_47, psum_811, psum_1215, psum_1619, psum_2023, psum_2427, psum_2831;
int16_t psum_015, psum_1631;
int32_t u_min,u_max,v_min,v_max;
u_min = origin.u-SEARCH_RAD_U;
u_max = origin.u+SEARCH_RAD_U;
v_min = origin.v-SEARCH_RAD_V;
v_max = origin.v+SEARCH_RAD_V;
// bins of interest
int32_t u_bin_min = max(origin_u_bin-3, 0);
int32_t u_bin_max = min(origin_u_bin+3, U_BIN_NUM-1);
int32_t v_bin_min = max(origin_v_bin-3, 0);
int32_t v_bin_max = min(origin_v_bin+3, V_BIN_NUM-1);
int32_t bin_class = origin.type;
int16_t tmp[32];
#pragma HLS ARRAY_PARTITION variable=tmp dim=1 complete
// for all bins of interest do
find_u_bin: for (int u_bin = u_bin_min; u_bin < u_bin_max; u_bin++) {
int u_bin_buffer = u_bin % 7;
find_col: for (int col_idx = m_buffer_num[u_bin_buffer][bin_class][v_bin_min]; col_idx < m_buffer_num[u_bin_buffer][bin_class][v_bin_max]; col_idx++) {
#pragma HLS UNROLL factor=parallel
Feature_Point target = m_buffer[u_bin_buffer][bin_class][col_idx];
if (target.u>=u_min && target.u<=u_max && target.v>=v_min && target.v<=v_max) {
psum = 0;
calc: for (int i = 0; i < 32; i++) {
#pragma HLS UNROLL factor=32
ap_uint<8> a = origin.d.range((i+1)*8-1, 8*i);
ap_uint<8> b = target.d.range((i+1)*8-1, 8*i);
tmp[i] = ABS(a, b);
}
// adder tree
...
I set the pragma here:
Matching_cand mc_matching[U_BIN_NUM][4][COL_BIN_FEATURE_MAX];
Matching_cand mp_matching[U_BIN_NUM][4][COL_BIN_FEATURE_MAX];
static int _p_matched_num;
#pragma HLS ARRAY_PARTITION variable=mc_buffer dim=3 type=cyclic factor=parallel
#pragma HLS ARRAY_PARTITION variable=mp_buffer dim=3 type=cyclic factor=parallel
find_match(origin, i, v_buffer_idx, col_idx, mp_buffer, mp_buffer_num, mc_matching);
I have tried to set the array partition pragma inside the "find_match" function, but then C synthesis takes a very long time.

Related

Linear Recursion Vectorization

To vectorize the following mathematical expression (linear recursion):
f(i)=f(i-1)/c+g(i), i starts at 1, f(0), c are constant numbers and given.
I can get better speed by using list-comprehension, that is:
def function():
    """Print every start index whose 10-element window hashes to 1876.

    The hash of the first window is sum(txt[i] ** (i + 1)) for i in
    range(sb_l).  Each later window is derived from the previous one with
    the linear recurrence

        vl = (vl - txt[i - 1]) / 2 + txt[i + sb_l - 1] * CONST

    so every step costs O(1) instead of re-hashing the whole window.
    The result list is printed, not returned.
    """
    txt = [0,2,0,2,0,2,0,2,2,2,0,2,2,2,0,2,0,2,2,2,0,2,0,2,0,2,0,2,2,2,0,2,0,2,0,2,0,2,2,2]
    sb_l = 10       # window length
    CONST = 512     # weight of the element entering the window
    target = 1876   # hash value being searched for

    # Hash of the first window (index 0).
    vl = sum(pow(txt[i], i + 1) for i in range(sb_l))
    matches = [0] if vl == target else []

    # Slide the window one element at a time.  True division is kept on
    # purpose so the arithmetic matches the original expression exactly.
    for i in range(1, len(txt) - sb_l + 1):
        vl = (vl - txt[i - 1]) / 2 + txt[i + sb_l - 1] * CONST
        if vl == target:
            matches.append(i)

    print(matches)
function()
I am looking for a vectorized/faster than vectorization (if possible!) implementation of the above code in python/c.
Note:
1.
A linear recursive function is a function that only makes a single call to itself each time the function runs (as opposed to one that would call itself multiple times during its execution). The factorial function is a good example of linear recursion.
2.
Note that all variables array are given for demonstration purpose, the main part for vectorization is :
[vl := vl+pow(txt[i],(i+1)) for i in range(sb_l)]
if (vl==1876):
indices_0=[0]
p=[i for i in range(1,len(txt)-sb_l+1)
if (vl := (vl-txt[i-1])/2+ txt[i+sb_l-1]*CONST)==1876]
Here, f(i-1)= (vl-txt[i-1]), c=2, g(i)= txt[i+sb_l-1]*CONST.
POSTSCRIPT: I am currently doing it in python, would it be much faster if it is implemented in C language's vectorization?
Here is an example of equivalent C program doing the same thing but faster:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// See: https://stackoverflow.com/questions/29787310/does-pow-work-for-int-data-type-in-c
/*
 * Integer power by repeated squaring.
 *
 * base: value to raise
 * exp:  exponent; intended for exp >= 0 (exp == 0 yields 1)
 *
 * Returns base raised to exp. The caller must make sure the result fits in
 * int64_t; no overflow detection is performed.
 */
int64_t int_pow(int64_t base, int exp)
{
    int64_t result = 1;
    while (exp)
    {
        // Multiply in the current power of base when this exponent bit is set.
        if (exp % 2)
            result *= base;
        exp /= 2;
        // Square only while exponent bits remain. An unconditional square
        // after the last bit is never used and can overflow (undefined
        // behaviour for signed integers) even when the result itself fits.
        if (exp)
            base *= base;
    }
    return result;
}
/*
 * Rolling-hash demo: prints, as "[a, b, ...]\n", every start index i whose
 * sb_l-element window of txt hashes to 1876.
 *
 * The first window is hashed directly with int_pow(); each following window
 * is derived from the previous one in O(1) with
 *     vl = (vl - txt[i-1]) / 2 + txt[i+sb_l-1] * CONST
 */
void function()
{
    // Both txt values and sb_l must not be too big or it will cause silent overflows (ie. wrong results)
    const int64_t txt[] = {0,2,0,2,0,2,0,2,2,2,0,2,2,2,0,2,0,2,2,2,0,2,0,2,0,2,0,2,2,2,0,2,0,2,0,2,0,2,2,2};
    const size_t txtSize = sizeof(txt) / sizeof(txt[0]);
    const size_t sb_l = 10;
    const int64_t CONST = 512;
    const int64_t target = 1876;
    int64_t vl = 0;
    size_t cur = 0;

    // No complete window exists: nothing can match.
    if (txtSize < sb_l)
    {
        printf("[]\n");
        fflush(stdout);
        return;
    }

    // At most one result per start index, so txtSize slots are always enough.
    int64_t *results = malloc(txtSize * sizeof *results);
    if (!results)
    {
        perror("malloc");
        return;
    }

    // Optimization so not to compute pow(0,i+1) which is 0
    for (size_t i = 0; i < sb_l; ++i)
        if (txt[i] != 0)
            vl += int_pow(txt[i], (int)(i + 1));
    if (vl == target)
        results[cur++] = 0;

    // "i + sb_l <= txtSize" is the bound "i < txtSize - sb_l + 1" written
    // without an unsigned subtraction that could wrap around.
    for (size_t i = 1; i + sb_l <= txtSize; ++i)
    {
        vl = (vl - txt[i-1]) / 2 + txt[i+sb_l-1] * CONST;
        if (vl == target)
            results[cur++] = (int64_t)i;
    }

    // Printing
    printf("[");
    for (size_t i = 0; i < cur; ++i)
    {
        if (i > 0)
            printf(", ");
        // PRId64 matches int64_t on every platform; "%ld" is wrong where long is 32 bits.
        printf("%" PRId64, results[i]);
    }
    printf("]\n");
    fflush(stdout);
    free(results);
}
/* Entry point: runs the demo once. Command-line arguments are not used. */
int main(int argc, char* argv[])
{
    (void) argc;
    (void) argv;

    function();

    return 0;
}
Be careful with overflows. You can put assertions if you are unsure about that in specific places (note they make the code slower when enabled though). Please do not forget to compile the program with optimizations (eg. -O3 with GCC and Clang and /O2 with MSVC).

Segfault after refactoring nested loops

I have some MATLAB code from a digital audio course that I've ported to C. Given an array of numeric data (for example, PCM audio encoded as double-precision floating-point), produce an array of data segments of a specified width and which overlap each other by a specified amount. Here's the relevant code.
typedef struct AudioFramesDouble {
    const size_t n,          // number of elements in each frame
                 num_frames; // number of entries in frames[]
    double* frames[];        // frames[i] points to n samples, zero-padded at the end of the data
} AudioFramesDouble;
/*
 * Produce a doubly-indexed array of overlapping substrings (a.k.a windows, frames,
 * segments ...) from a given array of data.
 *
 * x: array of (i.e., pointer to) data
 * sz: number of data elements to consider
 * n: number of elements in each frame (must be > 0)
 * overlap: each frame overlaps the next by a factor of 1 - 1/overlap (must be > 0).
 *
 * Frame i starts at x[i * hop] with hop = ceil(n / overlap). The frame count
 * is sz * overlap / n, rounded down when hop divides sz and rounded up
 * otherwise. Both are computed in exact integer arithmetic, so the result
 * does not depend on floating-point rounding.
 *
 * Returns a heap-allocated AudioFramesDouble. The caller owns it and must
 * free every frames[i] and then the struct itself. Returns NULL when x is
 * NULL, when n or overlap is 0, when a size computation would overflow, or
 * when an allocation fails (anything already allocated is released first).
 */
AudioFramesDouble* audio_frames_double(register const double x[], const size_t sz, const unsigned n, const unsigned overlap) {
    // Graceful exit on nullptr
    if (!x) return NULL;
    // n == 0 makes hop zero (modulo by zero below); overlap == 0 divides by zero.
    if (n == 0 || overlap == 0) return NULL;

    // hop = ceil(n / overlap): lets us "hop" to the start of the next frame.
    const size_t hop = (size_t) (n / overlap) + (n % overlap != 0);

    // sz / (n / overlap) == sz * overlap / n; refuse inputs where the product wraps.
    if (sz > (size_t) -1 / overlap) return NULL;
    const size_t scaled = sz * overlap;
    const size_t num_frames = (sz % hop == 0)
        ? scaled / n                        // exact fit: round down
        : scaled / n + (scaled % n != 0);   // room for zero-padding: round up

    // Guard the size of the header plus the frame-pointer table against overflow.
    if (num_frames > ((size_t) -1 - sizeof (AudioFramesDouble)) / sizeof (double*)) return NULL;

    // The struct members are const, so build a template and copy it in.
    AudioFramesDouble af = {.n = n, .num_frames = num_frames};
    // We want afp->frames to appear as (double*)[num_frames].
    AudioFramesDouble* afp = malloc((sizeof *afp) + (sizeof (double*) * num_frames));
    if (!afp) return NULL;
    memcpy(afp, &af, sizeof af);

    for (size_t i = 0; i < num_frames; ++i) {
        /* Allocate zero-initialized space for each frame. If this fails,
           free everything allocated so far and return a null pointer. */
        double* frame = calloc(n, sizeof *frame);
        if (!frame) {
            while (i > 0) {
                free(afp->frames[--i]);
            }
            free(afp);
            return NULL;
        }
        afp->frames[i] = frame;

        // Copy the samples that exist; the rest of the frame stays zero.
        const size_t start = i * hop;
        if (start < sz) {
            size_t count = sz - start;
            if (count > n) count = n;
            memcpy(frame, x + start, count * sizeof *frame);
        }
    }
    return afp;
}
This performs as expected. I wanted to optimize the nested FOR to the following
for (size_t i = 0, j = 0, k; i < num_frames; (j == n - 1) ? (j = 0,i++) : ++j) {
// If we've reached the end of the frame, reset j to zero.
// Then allocate the next frame and check for null.
if (j == 0 && !!(afp->frames[i] = calloc(n, sizeof(double)))) {
double* p = afp->frames[i];
for (long ii = ((long)i) - 1; 0 <= ii; ii--) {
free(afp->frames[--i]);
}
free(afp);
return (void*) p;
}
if (sz <= (k = (i*hop) + j)) break;
afp->frames[i][j] = x[k];
}
This actually compiles and runs just fine; but in my testing, when I try to access the last frame as in
xFrames->frames[xFrames->num_frames-1],
I get a segmentation fault. What's going on here? Am I neglecting an edge case in my loop? I've been looking over the code for awhile, but I might need a second set of eyes. Sorry if the answer is glaringly obvious; I'm a bit of a C novice.
P.S. I'm a fan of branchless programming, so if anyone has tips for cutting out those IFs, I'm all ears. I was using ternary operators before, but reverted to IFs for readability in debugging.
Remember that the logical operators && and || perform short-circuit evaluation.
That means if j != 0 then you won't actually call calloc, and you'll have an invalid pointer in afp->frames[i].

SSE for 2D arrays

I want to change the following code using SSE3 instructions:
for (i=0; i<=imax+1; i++) {
/* The vertical velocity approaches 0 at the north and south
* boundaries, but fluid flows freely in the horizontal direction */
v[i][jmax] = 0.0;
u[i][jmax+1] = u[i][jmax];
v[i][0] = 0.0;
u[i][0] = u[i][1];
}
u and v are 2D arrays of type float. What I have so far is this but the program does not run correctly.
int loop2 = ((imax+1) / loopFactor) * loopFactor;
for(i=0; i<loop2; i+=loopFactor) {
__m128 zeroVec = _mm_set1_ps(0.0f);
_mm_storeu_ps(&v[i][jmax], zeroVec);
__m128 umaxVec = _mm_loadu_ps(&u[i][jmax]);
_mm_storeu_ps(&u[i][jmax+1], umaxVec);
__m128 zVec = _mm_set1_ps(0.0f);
_mm_storeu_ps(&v[i][0], zVec);
__m128 uVec = _mm_loadu_ps(&u[i][1]);
_mm_storeu_ps(&u[i][0], uVec);
}
for (; i<=imax+1; i++){
v[i][jmax] = 0.0;
u[i][jmax+1] = u[i][jmax];
v[i][0] = 0.0;
u[i][0] = u[i][1];
}
I suspect that this is because _mm_loadu_ps stores values for u[i][1], u[i][2], u[i][3] and u[i][4] but I want to store the values u[i][1], u[i+1][1], u[i+2][1], u[i+3][1] and u[i+4][1]. How can I do that? Loopfactor has a value of 4.
Any help is really appreciated.

multiple analog inputs to produce individual averages for each channel

I am trying to put four analog inputs into individual channels that contain an array. Once that happens I am trying to get an average of each channel's array getting a single int or float. Lastly, I want to compare the averages in an if statement to get a serial print and divide the compared averages.
I am just confused on what in the code I pieced together is necessary.
Thank you for any advice or help.
Here is my code below
#include <Servo.h>
// Values associated with analog pins A0..A3; loop() assigns the
// analogRead() result of the matching pin to each of these.
float sVal0 = 0.0;
float sVal1 = 0.0;
float sVal2 = 0.0;
float sVal3 = 0.0;
// NOTE(review): sVal02 and sVal13 are not referenced anywhere in the visible
// sketch - presumably meant for pairwise comparisons; confirm or remove.
float sVal02 = 0.0;
float sVal13 = 0.0;
const int numReadings = 10; // number of readings averaged per channel
const int numChannels = 4; // 4 analog inputs (A0..A3)
int readings[numChannels][numReadings]; // readings[channel][sample] from the analog inputs
int index; // the index of the current reading
// One-time initialisation: open the serial port used for all output.
void setup () {
  const unsigned long baudRate = 9600;
  Serial.begin(baudRate);
}
void loop () {
sVal0 = analogRead(A0);
sVal1 = analogRead(A1);
sVal2 = analogRead(A2);
sVal3 = analogRead(A3);
for (int chan = 0; chan <= numChannels; ++chan ){
Serial.println(sVal0[chan]); // serial print each array
Serial.println(sVal1[chan]);
Serial.println(sVal2[chan]);
Serial.println(sVal3[chan]);
for (int thisReading = 0; thisReading < numReadings; thisReading++) {
readings[thisReading] = 0;
index = index + 1;
}
if (index >= numReadings) {
index = 0;
sVal0_avg = sVal0[chan]/numReadings; // get average
sVal1_avg = sVal0[chan]/numReadings;
sVal2_avg = sVal0[chan]/numReadings;
sVal3_avg = sVal0[chan]/numReadings;
}
}
if (sVal1_avg > sVal3_avg) {
Serial.print("1 avg: );
Serial.println(sVal1_avg);
sVal31 = sVal3_avg / sVal1_avg;
Serial.print("comparison : ");
Serial.println(sVal31);
}
}

U-Boot: Unexpected problems porting code

I want to extend the u-boot SPL code with some fuzzy extractor logic by adding code into {u-boot_sources}/arch/arm/cpu/armv7/omap-common/hwinit-common.c. U-boot shall be used on a PandaBoard ES (omap4460 SoC).
Thus, first I successfully implemented the code on my x86 pc and I am porting it to the ARM-based PandaBoard. The complete code can be found here (as a side note the "main" function is s_init()):
http://pastebin.com/iaz13Yn9
However, I am experiencing dozens of unexpected effects, which result in either execution stopping partway through the code, u-boot stopping after reading u-boot.img, or no output being sent (and thus no boot) at all.
For example, I want to call two functions (computeSyndrome, decodeErrors) inside a for-loop, which is part of another function golayDecode.
For my first problem please ignore the code below the multiline comment starting with /* >>>> These lines of code below totally break u-boot. Also only the function computeSyndrome in conjunction with the calling function golayDecode is important.
The issue: If I comment out the calls to both functions computeSyndrome and decodeErrors, everything works fine and the OS (Android) boots. However, if computeSyndrome is not commented out and thus gets executed, u-boot gets stuck after displaying reading u-boot.img.
The funny thing about it: even if I replace computeSyndrome with a bogus function which does nothing but iterate over a value or display stuff, u-boot gets stuck as well.
Furthermore, if I remove the multiline comment further below to also include the residual code, u-boot doesn't display any character. (1*)
I am a beginner regarding microprocessor programming but I can not figure out a possible error in these 12 lines of the computeSyndrome function or the general behaviour of u-boot at all. (2*)
Does anyone have a clue what I am missing?
Thanks,
P.
1* I am using minicom to display the output of u-boot, which I receive over serial-usb-converter.
2* I am using the following compiler flags to make sure there are no errors at compile time: -Wall -Wstrict-prototypes -Wdisabled-optimization -W -pedantic
/*
 * Golay decoding stage over the 30-word encoded secret.
 *
 * x, y:                       12-entry tables passed through to computeSyndrome()/decodeErrors()
 * golayEncodedSecret:         30 received words, processed as 15 consecutive pairs
 * s:                          syndrome value passed (by value) to both helpers
 * repetitionDecodedSecretBits: only used by the currently disabled reconstruction code
 *
 * For every pair (k, k+1) this prints the pair and calls computeSyndrome()
 * and decodeErrors() on it. The secret reconstruction and SHA-256 key
 * derivation below are commented out.
 *
 * NOTE(review): computeSyndrome() takes s by value, so the syndrome it
 * computes is not visible here and decodeErrors() receives this function's
 * unmodified s.
 */
void golayDecode(volatile int x[12], volatile int y[12], volatile unsigned int golayEncodedSecret[30], volatile unsigned int s, volatile unsigned char repetitionDecodedSecretBits[360]){
    printf("\n[I] - Performing Golay decoding\r\n");
    /* secret, currentByte, tmpByte and golayDecodedSecret are only referenced
       by the disabled code below; they are kept so it can be re-enabled. */
    volatile unsigned char secret[22] = {0};
    volatile unsigned char currentByte = 0, tmpByte = 0;
    volatile unsigned int golayDecodedSecret[30] ={0};
    volatile int twelveBitCounter = 0;//, j = 0, k = 0, q = 0, aux = 0, found = 0, bitCounter = 0, i_2 = 7, currentSecretEncByte = 0x00;
    volatile int c_hat[2] = {0}, e[2] = {0};
    e[0] = s;
    e[1] = 0;
    for(twelveBitCounter = 0; twelveBitCounter < 30; twelveBitCounter+=2){
        printf("Computing syndrome and decoding errors for bytes %03x & %03x\n", golayEncodedSecret[twelveBitCounter], golayEncodedSecret[twelveBitCounter+1]);
        computeSyndrome(golayEncodedSecret[twelveBitCounter], golayEncodedSecret[twelveBitCounter+1], x, y, s);
        /* Decode the same pair the syndrome was computed for. This call used
           to index with `i`, which is not declared in this function. */
        decodeErrors(golayEncodedSecret[twelveBitCounter], golayEncodedSecret[twelveBitCounter+1], x, y, s);
    }
    printf("\n[D] - Reconstructing secret bytes\r\n");
/* >>>> These lines of code below totally break u-boot
for(i = 0; i < 30; i+=2){
currentSecretEncByte = golayDecodedSecret[i];
volatile int j = 11;
// Access each source bit
for(; 0<=j; j--){
volatile int currentSourceBit = (currentSecretEncByte >> j) & 0x01;
repetitionDecodedSecretBits[bitCounter] = currentSourceBit;
bitCounter++;
}
}
k = 0;
for(i = 0; i<176; i++){
tmpByte = repetitionDecodedSecretBits[i] << i_2;
currentByte = currentByte | tmpByte;
i_2--;
if(i_2==0){ // We collected 8 bits and created a byte
secret[k] = currentByte;
i_2 = 7;
tmpByte = 0x00;
currentByte = 0x00;
k++;
}
}
SHA256_CTX ctx;
unsigned char hash[32];
printf("\n[I] - Generating secret key K\n");
sha256_init(&ctx);
sha256_update(&ctx,secret,strlen((const char*)secret));
sha256_final(&ctx,hash);
printf("\n[I] - This is our secret key K\n\t==================================\n\t");
print_hash(hash);
printf("\t==================================\n");
*/
}
/*
 * Syndrome computation for one received word pair (r0, r1).
 *
 * For each of the 12 table rows, one bit is formed as the XOR of the low
 * 12 bits of (x[row] & r0) together with the low 12 bits of (y[row] & r1).
 * The bits are shifted into s, first row ending up most significant.
 *
 * NOTE(review): s is a by-value parameter and the function returns void,
 * so the value accumulated in s is not visible to the caller.
 */
void computeSyndrome(int r0, int r1, volatile int x[12], volatile int y[12], volatile unsigned int s){
    unsigned int row, bit, parity;

    s = 0;
    for (row = 0; row < 12; row++) {
        parity = 0;
        /* x[row] and y[row] are re-read on every iteration, as before,
           because the tables are volatile. */
        for (bit = 0; bit < 12; bit++) {
            parity ^= ((x[row] & r0) >> bit) & 0x01;
        }
        for (bit = 0; bit < 12; bit++) {
            parity ^= ((y[row] & r1) >> bit) & 0x01;
        }
        s = (s << 1) ^ parity;
    }
}
/*
 * Function to recover the original word pair: searches for an error pattern
 * e for the received pair (r0, r1) using syndrome s and the tables x and y,
 * following the numbered steps 2-7 marked below, then forms the estimated
 * codeword c_hat = (r0 ^ e[0], r1 ^ e[1]).
 *
 * r0, r1: received word pair
 * x, y:   12-entry tables indexed 0..11
 * s:      syndrome for this pair
 *
 * NOTE(review): e and c_hat are local variables and the function returns
 * void, so the estimated codeword is not passed back to the caller; the
 * only visible effects are the printf calls.
 * NOTE(review): weight() is defined outside this view - presumably the
 * number of set bits; confirm.
 */
void decodeErrors(int r0, int r1, volatile int x[12], volatile int y[12], volatile unsigned int s){
//printf("\n[D] - Starting to decode errors for %3x | %3x\n", r0, r1);
/* {0xaa} sets only element [0]; element [1] of each array starts at 0. */
volatile unsigned int c_hat[2] = {0xaa}, e[2] = {0xaa};
volatile unsigned int q;
unsigned int i, j, aux, found;
//printf("Step 2\n");
/* STEP 2: a syndrome of weight <= 3 is used directly as the error pattern. */
if(weight(s)<=3){
e[0] = s;
e[1] = 0;
}else{
/******* STEP 3 */
//printf("Step 3\n");
/* Look for the first row i with weight(s ^ y[i]) <= 2. */
i = 0;
found = 0;
do{
if (weight(s^y[i]) <=2){
e[0] = s^y[i];
e[1] = x[i];
found = 1;
/* NOTE(review): looks like leftover debug output - confirm. */
printf("\ntest 2\n");
}
i++;
}while ((i<12) && (!found));
if (( i==12 ) && (!found)){
/******* STEP 4 */
//printf("Step 4\n");
/* Build q one bit per row j: each bit is the XOR of the low 12 bits of
   (y[j] & s), shifted in so that row 0 ends up most significant. */
q = 0;
for (j=0; j<12; j++){
aux = 0;
for (i=0; i<12; i++)
aux = aux ^ ( (y[j]&s)>>i & 0x01 );
q = (q<<1) ^ aux;
}
/******* STEP 5 */
//printf("Step 5\n");
if (weight(q) <=3){
e[0] = 0;
e[1] = q;
}else{
/******* STEP 6 */
//printf("Step 6\n");
/* Same search as step 3, using q in place of s. */
i = 0;
found = 0;
do{
if (weight(q^y[i]) <=2){
e[0] = x[i];
e[1] = q^y[i];
found = 1;
}
i++;
}while((i<12) && (!found));
if ((i==12) && (!found)){
/******* STEP 7 */
/* No pattern found: e still holds its initial {0xaa, 0}, which is what
   gets XORed into c_hat below. */
printf("\n[E] - uncorrectable error pattern! (%3x | %3x)\n", r0, r1);
/* You can raise a flag here, or output the vector as is */
//exit(1);
}
}
}
}
/* Estimated codeword = received words with the error pattern removed. */
c_hat[0] = r0^e[0];
c_hat[1] = r1^e[1];
//printf("\t\tEstimated codeword = %x%x\n", c_hat[0], c_hat[1]);
}
Indeed, the code was a little bit too complex to be executed at this point of boot time. At this time there is no real CRT and I only have a minimal stack.
Thus, I moved the code to board_init_f() which is still part of the SPL. It gave more stable results and my algorithm now works as expected.

Resources