char* function returning Null - c

I am currently working on a spam filter program that takes in and reads text files. In the initializeTraining function, I call the preprocess function which reads in each string from each line in a given text-file.
Once the newDict function is executed from the line first=newDict(string, NULL);, the program, however, returns an error stating that there is a load of null pointer of type 'char' at line while(string[i] !='\0' && i<WORDLENGTH) { in the newDict function.
It seems that the preprocess function is returning null pointers despite the fact that it still takes in the passed-in strings from the text file. Is there something that I'm doing wrong in the preprocess function?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
WORDLENGTH is the length of the word in the linked list named as dictionary
MAILSEPARATOR is the totken to differentiate the mails that are included in one file.
it is also the token to update updated in linkedlist
*/
#define WORDLENGTH 20
#define MAILSEPARATOR "#####"
/*
define DEBUG as 0 to disable debug mode and to 1 to enable the mode.
*/
#define DEBUG 0
typedef struct dictionary dict;
typedef dict* word_dict;
typedef enum {false, true} bool;
/*
linked list, count is for the total word count and
occur is the numbers of the mails that had the word
*/
struct dictionary{
char word[WORDLENGTH];
int occur;
int count;
word_dict next;
bool updated;
};
// if there is no matching words after searching, create a new node
word_dict newDict(char *string, word_dict next){
word_dict target = (word_dict)malloc(sizeof(dict));
int i = 0;
while(string[i] !='\0' && i<WORDLENGTH) {
target->word[i] = string[i];
i++;
}
target->count = 1;
target->next = next;
target->occur = 1;
target->updated = true;
return target;
}
/*
preprocessor, convert string to lowercase
and trim the puctuations at the back
*/
char* preprocess(char* string){
#if DEBUG
printf("\nbefore preprocess, string: %s \n", string);
#endif
int i=0;
while(string[i] != '\0') { // convert to lower case
if (string[i] >= 65 && string[i] < 90) {
string[i] += 32;
i++;
}
while(true) {
i--;
if(i < 0) {
#if DEBUG
printf("word of only punctuations \n");
#endif
return NULL;
} else if((string[i] >= 97 && string[i] <= 122) || (string[i] >= 48 && string[i] <= 57)){
string[i+1]='\0';
break;
}
}
i=0;
while(true) {
if ((string[i] >= 97 && string[i] <= 122) || (string[i] >= 48 && string[i] <= 57)){
break;
} else {
string = &string[i+1];
}
i++;
}
}
#if DEBUG
printf("_after preprocess, string: %s\n", string);
#endif
return string;
}
/*
initialize training
reads the sample mails and creates a linked list of
the percentages of the words occuring in the sample mails
*/
word_dict initializeTraining(char* filename){
FILE *fp = NULL;
fp = fopen(filename, "r");
if(fp == NULL) {
printf("no file found\n");
return NULL;
}
char* string;
string = (char*)malloc(sizeof(char)*50);
word_dict first = NULL;
fscanf(fp, "%s\n", string);
string = preprocess(string);
first = newDict(string, NULL);
while(fscanf(fp,"%s", string) == 1) {
first = searchDict(string, first);
}
fclose(fp);
free(string);
return first;
}
/*
tests whether the mail is pam or not
takes the filename of the test mail,
returns true or false depending on the email's content
*/
bool bayesian_spam_filter(char * filename_for_test_email) {
word_dict spamDict=initializeTraining("spam.txt");
word_dict nonspamDict=initializeTraining("not_spam.txt");
#if DEBUG
printDict(spamDict);
printDict(nonspamDict);
#endif
FILE *stream=NULL;
stream = fopen(filename_for_test_email, "r");
if(stream == NULL){
printf("no file found\n");
return false;
}
char* string;
string = (char*)malloc(sizeof(char)*50);
int ps, pn; // probability of spam mail and non-spam mail
double prob = 0.5;
while(fscanf(stream,"%s", string) == 1){
char* tempString; // for handling the errors happening from string being null during preprocessing
tempString = preprocess(string);
if(tempString == NULL){
continue;
}
if((ps = searchTest(tempString, spamDict)) != 0) {
if((pn = searchTest(tempString, nonspamDict)) != 0) {
printf("ps:%3d, pn:%3d, %s\n", ps, pn, tempString);
prob = prob * (double) ps / ((prob* (double)ps + (1 - prob) * (double) pn));
printf("this prob: %.10f\n", prob);
}
}
}
//printf("%d, %d \n", pSProduct, pNProduct);
//proba=(float)(pSProduct/(pSProduct+pNProduct));
printf("Probability of mail being spam: %.10f\n", prob);
fclose(stream);
free(string);
if (prob > 0.9) {
return true;
}
return false;
}

It seems that the preprocess function is returning null pointers
Hardly surprising when it contains a line return NULL;. At that point, you should instead set the first character of the string to '\0' and return it since the surrounding code expects to get the string returned in all cases.
Another problem can be found in this section:
i=0;
while(true) {
if ((string[i] >= 97 && string[i] <= 122) || (string[i] >= 48 && string[i] <= 57)){
break;
} else {
string = &string[i+1];
}
i++;
}
Let's say the string is ".a" and go through the loop. Since the first character is not a letter, we do not break, instead we update the string pointer so that it points to "a" now. Then, we increment i. In the next iteration, string[i] is the null terminator, which is not a letter, so we continue. Since we are past the string's data, what follows is undefined behavior.
The simple fix for this is to not increment i and rather stick with [0] since you always want to remove from the beginning. The proper fix is to use i but without incrementing the string pointer, since you want to free it later – you must call free on the pointer returned by malloc, therefore modifying the pointer leads to undefined behavior! Instead of returning the string from preprocess, return the offset from the start (as counted by i), this way freeing the string later will work properly. The calling code would then look like this:
int offset = preprocess(string);
first = newDict(string + offset, NULL);

Related

program adds on a string seemingly out of nowhere

I'm writing a code that reads in an input file from stdin, and outputs the exact same content (to stdout) except that replaces any words found in the "dictionary" in the following way, in the exact order:
if the word exactly as it is stored as a key in dictionary appears, then find the corresponding value pair and print that out instead.
if the word that's capitalized properly (eg., Thomas, with capital first letter and lowercase for everything else) is a valid key in the dictionary, print out the corresponding value pair instead
if the lowercased version is a valid key, print out its corresponding value
if there are no matches, just print things out as they are.
(All non-alphabetical characters are just printed out "normally".)
A problem I've been having though is that when I'm doing (2), somehow a character ('U') gets tagged on to the end of the "string" or copy2 array when I'm testing "IPSUM" (all cap).
For instance, see this output:
My outputs are in the lines with "<", and the ">" indicate what should've been. Based on the order I'm checking things, since IPSUM is not in the dictionary (see the end of this post for contents of dictionary), it goes to (2) where IPSUM should become Ipsum, and it should print out the corresponding value of Ipsum. But instead I get IpsumU, and so the dictionary doesn't recognize the word. But I'm not sure where the 'U' is coming from, since the input is exactly
IPSUM (all cap).
Could anyone help me figure out what might be wrong with my code?
//for reference:
typedef struct HashBucketEntry {
void *key;
void *data;
struct HashBucketEntry *next;
} HashBucketEntry;
typedef struct HashTable {
int size;
unsigned int (*hashFunction)(void *);
int (*equalFunction)(void*, void*);
HashBucketEntry **buckets;
} HashTable;
//We have a Hashtable *dictionary.
void processInput() {
//char c;
int c;
int i = 0;
//char * word = (char *) malloc(60 * sizeof(char));
char word[60];
while (c = getchar()) {
if (isalpha(c)) {
word[i] = c;
i++;
} else {
word[i] = '\0';
if (word[0] != '\0') {
//char * copy = (char *) malloc(60 * sizeof(char));
char copy[60];
strcpy(copy, word);
unsigned int location = (dictionary->hashFunction)(copy) % (dictionary->size);
char * word_in_dict;
if (dictionary->buckets[location] != NULL) {
word_in_dict = (char *) dictionary->buckets[location]->data;
} else {
word_in_dict = NULL;
}
char copy2[60];
copy2[0] = toupper(copy[0]);
for(int i = 1; copy[i]; i++){
copy2[i] = tolower(copy[i]);
}
unsigned int location2 = (dictionary->hashFunction)(copy2) % (dictionary->size);
char * word_in_dict2;
if (dictionary->buckets[location2] != NULL) { //somehow this is NULL when IPSUM, even though copy2 has correct string
word_in_dict2 = (char *) dictionary->buckets[location2]->data;
} else {
word_in_dict2 = NULL;
}
char copy3[60];
for(int i = 0; copy[i]; i++){
copy3[i] = tolower(copy[i]);
}
unsigned int location3 = (dictionary->hashFunction)(copy3) % (dictionary->size);
char * word_in_dict3;
if (dictionary->buckets[location3] != NULL) {
word_in_dict3 = (char *) dictionary->buckets[location3]->data;
} else {
word_in_dict3 = NULL;
}
if (word_in_dict != NULL) {
fprintf(stdout, "%s", word_in_dict);
} else if (word_in_dict2 != NULL) {
fprintf(stdout, "%s", word_in_dict2);
} else if (word_in_dict3 != NULL) {
fprintf(stdout, "%s", word_in_dict3);
} else {
//fprintf(stdout, "%s", copy);
printf("%s", copy);
}
putchar(c);
i = 0;
} else if (c != EOF) {
putchar(c);
} else {
break;
}
}
}
}
The dictionary contains only these entries:
ipsum i%##!
fubar fubar
IpSum XXXXX24
Ipsum YYYYY211
Any help would be really appreciated!
Update in response to the answer:
I changed the code for copy2 to this:
for(j = 1; j < strlen(copy); j++) {
if (j < sizeof(copy2)) {
copy2[j] = tolower(copy[j]);
}
}
(and did a similar thing to copy3). The second case works, but now the third case fails; things only seem to work if I change the second case but not the third case. Does anyone know why this is the case?
Your code to creaty modified copies of your input strings, e.g.
char copy2[60];
copy2[0] = toupper(copy[0]);
for(int i = 1; copy[i]; i++){
copy2[i] = tolower(copy[i]);
}
does not copy the terminating '\0'. As automatic variables are not implicitly initialized, the corresponding memory may contain any data (from prevous loop cycles or from unrelated code) which may appear as trailing characters. You must append a '\0' character after the last character of your string.
This error may result in out-of-bounds access to the array when you access it as a string if there is no '\0' within the array bounds. (undefined behavior)
Your code itself might result in out-of-bounds access if the input string is too long. You should add a check to prevent access to array elements at i >= sizeof(copy2).
I suggest something like this:
char copy2[60];
copy2[0] = toupper(copy[0]);
/* avoid reading past the end of an empty string */
if(copy[0]) {
for(int i = 1; copy[i] && (i < sizeof(copy)-1); i++){
copy2[i] = tolower(copy[i]);
}
/* variable i will already be incremented here */
copy2[i] = '\0';
}
Edit as a response to a question in a comment:
You cannot combine strcpy and tolower, but you can copy the string first and modify the characters in-place afterwards.
Example:
char copy2[60];
if(strlen(copy) y sizeof(copy2)) {
strcpy(copy2, copy);
copy2[0] = toupper(copy2[0]);
if(copy[0]) {
/* The length has been checked before, no need to check again here */
for(int i = 1; copy[i]; i++) {
copy2[i] = tolower(copy2[i]);
}
/* the string is already terminated */
}
} else {
/* string too long, handle error */
}
or with truncating instead of reporting an error
char copy2[60];
strncpy(copy2, copy, sizeof(copy)-1);
copy2[sizeof(copy2)-1] = '\0';
copy2[0] = toupper(copy2[0]);
if(copy[0]) {
/* A long string would have been truncated before, no need to check the length here */
for(int i = 1; copy[i]; i++) {
copy2[i] = tolower(copy2[i]);
}
/* the string is already terminated */
}

String Extraction and Insertion Functions in C?

I need to write these two functions:
Precondition: hMy_string is the handle to a valid My_string object.
Postcondition: hMy_string will be the handle of a string object that contains
the next string from the file stream fp according to the following rules.
1) Leading whitespace will be ignored.
2) All characters (after the first non-whitespace character is obtained and included) will be added to the string until a stopping condition
is met. The capacity of the string will continue to grow as needed
until all characters are stored.
3) A stopping condition is met if we read a whitespace character after
we have read at least one non-whitespace character or if we reach
the end of the file.
Function will return SUCCESS if a non-empty string is read successfully.
and failure otherwise. Remember that the incoming string may aleady
contain some data and this function should replace the data but not
necessarily resize the array unless needed.
Status my_string_extraction(MY_STRING hMy_string, FILE* fp);
Precondition: hMy_string is the handle to a valid My_string object.
Postcondition: Writes the characters contained in the string object indicated by the handle hMy_string to the file stream fp.
Function will return SUCCESS if it successfully writes the string and
FAILURE otherwise.
Status my_string_insertion(MY_STRING hMy_string, FILE* fp);
However, I am getting a segmentation fault with my current code:
#include <stdio.h>
#include <stdlib.h>
#include "my_string.h"
Status my_string_extraction(MY_STRING hMy_string, FILE *fp)
{
string *pString = (string *) hMy_string;
int lws = 0;
int exit = 0;
int nws = 0;
int i;
int count = 0;
while(fp != NULL && exit == 0) {
if(pString->size >= pString->capacity) {
char *t_data = (char *)malloc(sizeof(char) * pString->capacity * 2);
if(t_data == NULL) {
return FAILURE;
}
for(i = 0; i < pString->size; i++) {
t_data[i] = pString->data[i];
}
free(pString->data);
pString->data = t_data;
pString->capacity *= 2;
if(getc(fp) == ' ' && lws == 0) {
lws++;
} else if(getc(fp) == ' ' && lws == 1) {
exit++;
} else if(getc(fp) == ' ' && nws > 0) {
exit++;
} else {
pString->data[count] = getc(fp);
count++;
pString->size++;
nws++;
}
fp++;
}
return SUCCESS;
}
Status my_string_insertion(MY_STRING hMy_string, FILE *fp)
{
int i;
string *pString = (string *) hMy_string;
for(i = 0; i < pString->size; i++) {
putc(pString->data[i], fp);
}
if(fp == NULL) {
return FAILURE;
}
return SUCCESS;
}
I am getting a segmentation fault with my current code
Why do you do this:
fp++; // incrementing pointer does not get you the next character in the file
When you try to get next character via getc(fp) that is enough to cause the crash . fp will be invalid pointer at that moment.
Also, I see no restrictions for crossing data boundary:
pString->data[count] = getc(fp);
count++; // count is not restricted from growing

C code to read config file and parse directives

I'm trying to read a config file and parse the config directives. So far I have the following code, I need advice on how to improve this or change it. Is this efficient? Thanks!
struct config
{
char host;
char port;
}
void parse_line(char *buf) {
char *line;
if(strstr(buf, "host=") || strstr(buf, "host = ") || strstr(buf, "host= ") || strstr(buf, "host =")) {
line = strstr(buf, "=");
printf("Host: %s", &line[2]);
} else if(strstr(buf, "port=") || strstr(buf, "port = ") || strstr(buf, "port= ") || strstr(buf, "port =")) {
line = strstr(buf, "=");
printf("Port: %s", &line[2]);
}
}
int main(int argc, char *argv[])
{
char *file_name;
FILE *file;
file_name = argv[1];
file = fopen(file_name, "r");
// check if file is NULL, etc..
char buffer[BUFSIZ];
char *line;
int i;
while(fgets(buffer, sizeof(buffer), file) != NULL) {
for(i = 0; i < strlen(buffer); i++) { // iterate through the chars in a line
if(buffer[i] == '#') { // if char is a #, stop processing chars on this line
break;
} else if(buffer[i] == ' ') { // if char is whitespace, continue until something is found
continue;
} else {
parse_line(buffer); // if char is not a # and not whitespace, it is a config directive, parse it
break;
}
}
}
fclose(file);
return 0;
}
I am looking for a way to ignore # if it is a first character on a line, and also lines that are white spaces. I think my code does that, but is that efficient?
EDIT:
Thanks everyone for all the suggestions, I have managed to do this simple code to trim the white spaces, so that I wouldn't need all the strstr() calls.
void trim(char *src)
{
int i, len;
len = strlen(src);
for(i = 0; i < len; i++) {
if(src[i] == ' ') {
continue;
}
if(src[i] == '\n' || src[i] == '#') {
break;
}
printf("%c", src[i]); // prints: host=1.2.3.4
}
}
int main(void)
{
char *str = "host = 1.2.3.4 # this is a comment\n";
trim(str);
return EXIT_SUCCESS;
}
It prints correctly: host=1.2.3.4 but now I need this in a variable to be further parsed. I think I will try to use strcpy.
EDIT 2:
I do not think that strcpy is the right choice. Those chars are printed out in a loop, so every time I use strcpy, the previous char is overwritten. I have tried this, but it does not work because only the host= part is placed into arr. The IP part is not placed into arr.. how can this be fixed..
char arr[sizeof(src)];
for(i = 0; i < len; i++) {
if(src[i] == ' ') {
continue;
}
if(src[i] == '\n' || src[i] == '#') {
break;
}
printf("%c", src[i]); // prints: host=1.2.3.4
arr[i] = src[i];
}
int j;
for(j = 0; j < sizeof(arr); j++) {
printf("%c", arr[j]); //prints: host=
}
EDIT 3:
I found the correct way of placing chars into arr:
int i, count = 0;
for(i = 0; i < len; i++) {
if(src[i] == ' ') {
continue;
}
if(src[i] == '\n' || src[i] == '#') {
break;
}
arr[count] = src[i];
count++;
}
Your implementation is pretty fragile. Parsers really ought to verify syntax and return errors when they see something unexpected. For example, yours should detect missing fields and multiply defined ones.
Fortunately this parsing problem is simple enough for sscanf to handle everything:
skip blank lines,
skip comments
ignore any amount of whitespace
extract the key/value pairs
Here's code:
#include <stdio.h>
#define CONFIG_SIZE (256)
#define HOST_SET (1)
#define PORT_SET (2)
typedef struct config {
unsigned set;
char host[CONFIG_SIZE];
unsigned long port;
} CONFIG;
// Parse the buffer for config info. Return an error code or 0 for no error.
int parse_config(char *buf, CONFIG *config) {
char dummy[CONFIG_SIZE];
if (sscanf(buf, " %s", dummy) == EOF) return 0; // blank line
if (sscanf(buf, " %[#]", dummy) == 1) return 0; // comment
if (sscanf(buf, " host = %s", config->host) == 1) {
if (config->set & HOST_SET) return HOST_SET; // error; host already set
config->set |= HOST_SET;
return 0;
}
if (sscanf(buf, " port = %lu", &config->port) == 1) {
if (config->set & PORT_SET) return PORT_SET; // error; port already set
config->set |= PORT_SET;
return 0;
}
return 3; // syntax error
}
void init_config(CONFIG *config) {
config->set = 0u;
}
void print_config(CONFIG *config) {
printf("[host=%s,port=", config->set & HOST_SET ? config->host : "<unset>");
if (config->set & PORT_SET) printf("%lu]", config->port); else printf("<unset>]");
}
int main(int argc, char *argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: %s CONFIG_FILE\n", argv[0]);
return 1;
}
FILE *f = fopen(argv[1], "r");
char buf[CONFIG_SIZE];
CONFIG config[1];
init_config(config);
int line_number = 0;
while (fgets(buf, sizeof buf, f)) {
++line_number;
int err = parse_config(buf, config);
if (err) fprintf(stderr, "error line %d: %d\n", line_number, err);
}
print_config(config);
return 0;
}
With this input:
# This is a comment
This isn't
# Non-leading comment
host = 123.456.789.10
###
port =42
port= 1
host=fruit.foo.bar
the output is
error line 3: 3
error line 10: 2
error line 11: 1
[host=fruit.foo.bar,port=1]
Note that when the parser discovers a field has already been set, it still uses the latest value in the config. It's easy enough to keep the original instead. I'll let you have that fun.
I think parse_line is a little bit rigid for my taste, I would use strtok
instead. Then you don't have to worry too much about spaces, like you do if you
have a space before the = sign.
Your struct is also wrong, host and port would only hold a character.
Besides port should be an integer. And you need a semicolon ; after the
struct definition.
struct config
{
char host[100];
int port;
};
int parse_line(struct config *config, char *buf)
{
if(config == NULL || buf == NULL)
return 0;
char varname[100];
char value[100];
const char* sep = "=\n"; // get also rid of newlines
char *token;
token = strtok(buf, sep);
strncpy(varname, token, sizeof varname);
varname[sizeof(varname) - 1] = 0; // making sure that varname is C-String
trim(varname);
token = strtok(NULL, sep);
if(token == NULL)
{
// line not in format var=val
return 0;
}
strncpy(value, token, sizeof value);
value[sizeof(varname) - 1] = 0
trim(value);
if(strcmp(varname, "port") == 0)
{
config->port = atoi(value);
return 1;
}
if(strcmp(varname, "host") == 0)
{
strncpy(config->host, value, siezof config->host);
config->host[(sizeof config->host) - 1] = 0;
return 1;
}
// var=val not recognized
return 0;
}
Note that I used a function called trim. This function is not part of the
standard library. Below I posted a possible implementation of such a function.
I like using trim because it gets rid of white spaces. Now you can do this in
main:
struct config config;
// initializing
config.port = 0;
config.host[0] = 0;
int linecnt = 0;
while(fgets(buffer, sizeof(buffer), file) != NULL) {
linecnt++;
trim(buffer);
if(buffer[0] == '#')
continue;
if(!parse_line(&config, buffer))
{
fprintf(stderr, "Error on line %d, ignoring.\n", linecnt);
continue;
}
}
A possible implementation of trim
void rtrim(char *src)
{
size_t i, len;
volatile int isblank = 1;
if(src == NULL) return;
len = strlen(src);
if(len == 0) return;
for(i = len - 1; i > 0; i--)
{
isblank = isspace(src[i]);
if(isblank)
src[i] = 0;
else
break;
}
if(isspace(src[i]))
src[i] = 0;
}
void ltrim(char *src)
{
size_t i, len;
if(src == NULL) return;
i = 0;
len = strlen(src);
if(len == 0) return;
while(src[i] && isspace(src[i]))
i++;
memmove(src, src + i, len - i + 1);
return;
}
void trim(char *src)
{
rtrim(src);
ltrim(src);
}
There are a few ways that you can improve performance:
Calling strstr() in this scenario is inefficient, because the presence of the "host" part of buf can be checked once instead of multiple times every time strstr() is called. Instead, make an if statement that checks if buf begins with "host", then check if buf contains the other elements. The same thing applies to the portion of code checking for the presence of "port".
In the loop in main, instead of doing this:
for(i = 0; i < strlen(buffer); i++) { // iterate through the chars in a line
if(buffer[i] == '#') { // if char is a #, stop processing chars on this line
break;
} else if(buffer[i] == ' ') { // if char is whitespace, continue until something is found
continue;
} else {
parse_line(buffer); // if char is not a # and not whitespace, it is a config directive, parse it
break;
}
do this:
for(i = 0; i < strlen(buffer); i++) { // iterate through the chars in a line
char temp = buffer[i];
if(temp == '#') { // if char is a #, stop processing chars on this line
break;
} else if (temp != ' ') {
parse_line(buffer); // if char is not a # and not whitespace, it is a config directive, parse it
break;
}
Checking to see if something is not equal to another is likely to be just as fast as checking if they are equal (at least on Intel, the je (jump equal) and jne (jump not equal) instructions exhibit the same latency of 1 cycle each), so the statement with the continue in it is not necessary. The temp variable is so that buffer[i] does not need to be calculated in the second if again in case the first if is false. Also, do what user3121023 stated below (same reason for performance as creating the temp variable).
You can use operating-system-specific functions (such as thos from the library WINAPI/WIN32/WIN64 (synonyms) on windows) instead of C standard library functions. Microsoft has very good documentation about their functions in the MSDN (Microsoft Developer Network) web site.
Use uint_fast8_t (defined in stdint.h, this typedef is set to the fastest integer type greater than or equal to the size in bits specified in the typedef) when performing operations on the host and port (but use chars when storing the variables on the disk, in order to make read i/o operations faster).
This isn't related to performance , but use return EXIT_SUCCESS; in main instead of return 0;, since using EXIT_SUCCESS is more readable and exhibits the same performance.
Honestly, I can't help but wonder if rolling your own parser is so great.
Why not use an existing JSON or YAML parser and test for keys in the parsed data?
This will be easily extendible by allowing for new keys to be added with very little effort and the common format of the configuration file makes it very easy for developers to edit.
If you are going to roll out your own parser, than some of the previously mentioned advice makes a lot of sense.
The biggest ones are: don't seek the whole buffer, read the single line that's in front of you and report any errors. Also, advance as you go.
Your parser should work correctly if someone would dump a GigaByte of garbage into the configuration file, so make no assumptions about the data.

conflict of using strcpy , strcat in C?

In the following code I'm trying to load a text file of words character by character
then I'm trying to save each whole word in hash table (array of strings)
but it seems that strcpy saves a whole word not a single char and I don't know why. Am I misusing strcpy and strcat?
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <ctype.h>
# include <stdbool.h>
bool load(const char* dictionary);
#define LENGTH 45
int main (int argc, char* argv[])
{
char* dictionary = argv[1];
load(dictionary);
return 0;
}
bool load(const char* dictionary)
{
int index = 0, words = 0, kk = 0;
int lastl = 0, midl = 0;
char word[LENGTH + 1];
char *wholeword[1001];
FILE* dic = fopen(dictionary, "r");
if (dic == NULL)
{
printf("Could not open %s.\n", dictionary);
return false;
}
for (int c = fgetc(dic); c != EOF; c = fgetc(dic))
{
// allow only alphabetical characters and apostrophes
if (isalpha(c) || (c == '\'' && index > 0))
{
// append character to word
word[index] = c;
index++;
// ignore alphabetical strings too long to be words
if (index > LENGTH)
{
// consume remainder of alphabetical string
while ((c = fgetc(dic)) != EOF && isalpha(c));
// prepare for new word
index = 0;
}
}
// ignore words with numbers (like MS Word can)
else if (isdigit(c))
{
// consume remainder of alphanumeric string
while ((c = fgetc(dic)) != EOF && isalnum(c));
// prepare for new word
index = 0;
}
// we must have found a whole word
else if (index > 0)
{
// terminate current word
word[index] = '\0';
lastl = index - 1;
midl = (index - 1) % 3;
words++;
index = 0;
int hashi = (word[0] + word[lastl]) * (word[midl] + 17) % 1000;
wholeword[hashi] = (char*) malloc(sizeof(char) * (lastl + 2));
strcpy(wholeword[hashi], &word[0]); // ***
for (kk = 1; kk <= lastl + 1; kk++)
{
strcat(wholeword[words], &word[kk]);
}
}
}
fclose(dic);
return true;
}
Strcpy doesn't copy a single char, it copies all chars until the next null ('\0') byte. To copy a single char in your code try:
wholeword[hashi] = &word[0];
instead of:
strcpy(wholeword[hashi], &word[0]);
Yes you are misusing strcpy and strcat: these functions copy a whole source string to the destination array (at the end of an existing string there for strcat).
The following lines:
wholeword[hashi] = (char*) malloc(sizeof(char) * (lastl + 2));
strcpy(wholeword[hashi], &word[0]); // ***
for (kk = 1; kk <= lastl + 1; kk++)
{
strcat(wholeword[words], &word[kk]);
}
}
Can be replaced with a single call to
wholeword[hashi] = strdup(word);
strdup() allocates the memory, copies the argument string to it and returns the pointer. It is available on all Posix systems, if you do not have it, use these 2 lines:
wholeword[hashi] = malloc(lastl + 2);
strcpy(wholeword[hashi], word);
Notes:
you assume your hash to be perfect, without collisions. As currently coded, a collision causes the previous word to be removed from the dictionary and its corresponding memory to be lost.
the dictionary char *wholeword[1001]; is a local variable in the load function. It is uninitialized, so there is no way to know if an entry is a valid pointer to a word. It should be allocated, initialized to NULL and returned to the caller.

putenv() and setenv() on Unix

I need to get input from user and deal with variables. I need to have next features:
set varname = somevalue: set the value of the environment variable named varname to the value specified by somevalue.
delete varname: remove the named environment variable.
print varname: prints out the current value of the named environment variable.
What I have till now is this:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <readline/readline.h>
#include <readline/history.h>
int main(int argc, char **argv) {
char * s;
char * command;
char * varName;
char * varValue;
while (s = readline("prompt> ")) {
/* Initialise char* variables */
command = NULL;
varName = NULL;
varValue = NULL;
add_history(s); /* adds the line to the readline history buffer */
printf("> %s\n", s); //print > sign
int cmdNo = 1;
int i;
// parse through entire string
for (i = 0; i < strlen(s); i++)
{
// check for space or = and jump over it
while ((isspace(s[i]) || s[i] == '=') && (i < strlen(s)))
{
i++;
}
// check if i is greater than string size
if (i >= strlen(s))
{
printf("Bad command format!\n");
break;
}
// if cmdNo == 1, get the command
if (cmdNo == 1)
{
int commandSize = 0;
int index = i;
// point index to space
while (!isspace(s[index]))
{
commandSize++;
index++;
}
// get command
command = (char*)malloc(commandSize + 1);
int destIndex = 0;
// copy command into command array
while (i<index)
{
command[destIndex] = s[i];
destIndex++;
i++;
}
// adding terminate character
command[destIndex] = '\0';
// increase command number by 1
cmdNo++;
}
// if cmdNo == 2 we deal with variable name
else if (cmdNo == 2)
{
// variable name size
int varNameSize = 0;
int index = i;
// point index to space
while (!isspace(s[index]))
{
varNameSize++;
index++;
}
// get var name
varName = (char*)malloc(varNameSize + 1);
int index2 = 0;
while (i<index)
{
varName[index2] = s[i];
index2++;
i++;
}
// add terminate char
varName[index2] = '\0';
// increment cmdNo by 1
cmdNo++;
}
// if cmdNo == 3 we deal with variable value
else if (cmdNo == 3)
{
int valueSize = 0;
int index = i;
// point index to space
while (!isspace(s[index]) && s[index] != '\0')
{
valueSize++;
index++;
}
// get variable value
varValue = (char*)malloc(valueSize + 1);
int index2 = 0;
while (i<index)
{
varValue[index2] = s[i];
index2++;
i++;
}
// add terminate char
varValue[index2] = '\0';
}
}
// print command, variable name and value
if (command != NULL)
{
printf("%s", command);
}
if (varName != NULL)
{
printf(" %s", varName);
}
if (varValue != NULL)
{
printf(" %s\n", varValue);
}
/* clean up! */
free(s);
free(command);
free(varName);
free(varValue);
}
return(0);
}
Obviously, I had to put somewhere putenv(), setenv() or clearenv(). I don't have much experience with these commands.
Also, there is some error (Segmentation fault). This is response of system
The crash is caused by your loops while (!isspace(s[index])) for cmdNo 1 and 2 -- if there is no third (or second) word on the input line, they'll run past the NUL terminator in the string and (probably) crash. You need a check for NUL in these loops as you check in cmdNo 3 case.
You also have a problem if you have more than 3 words on the input line -- you'll go into an infinite loop on the 4th word.
Rather than duplicating the code for the word in if/else if/else if as you have done, its much better to put the words in an array. You could even use strtok_r or strsep rather than manually parsing out the characters.

Resources