Most efficient way to find if a string is mixedCase - c

Suppose I have very long strings and I want to see if a column is allLower, allUpper, or mixedCase. For example with the following column
text
"hello"
"New"
"items"
"iTem12"
"-3nXy"
The text would be mixedCase. A naive algorithm to determine this might be:
int is_mixed_case, is_all_lower, is_all_upper;
int has_lower = 0;
int has_upper = 0;
// for each row...for each column...
for (int i = 0; (c=s[i]) != '\0'; i++) {
if (c >='a' && c <= 'z') {
has_lower = 1;
if (has_upper) break;
}
else if (c >='A' && c <= 'Z') {
has_upper = 1;
if (has_lower) break;
}
}
is_all_lower = has_lower && !has_upper;
is_all_upper = has_upper && !has_lower;
is_mixed_case = has_lower && has_upper;
I'm sure there would be a more performant way to do this, however. What might be the most efficient way to do this algorithm/calculation?

If you know the character encoding that's going to be used (I've used ISO/IEC 8859-15 in the code example), a look-up table may be the fastest solution. This also allows you to decide which characters from the extended character set, such as µ or ß, you'll count as upper case, lower case or non-alphabetical.
char test_case(const char *s) {
static const char alphabet[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // ABCDEFGHIJKLMNO
1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, // PQRSTUVWXYZ
0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // abcdefghijklmno
2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0, // pqrstuvwxyz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,2,0,2,0,0,0,0, // Š š ª
0,0,0,0,0,1,2,0,0,2,0,2,0,1,2,1, // Žµ ž º ŒœŸ
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ
1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, // ÐÑÒÓÔÕÖ ØÙÚÛÜÝÞß
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // àáâãäåæçèéêëìíîï
2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2}; // ðñòóôõö øùúûüýþÿ
char cases = 0;
while (*s && cases != 3) {
cases |= alphabet[(unsigned char) *s++];
}
return cases; // 0 = none, 1 = upper, 2 = lower, 3 = mixed
}
As suggested in a comment by chux, you can set the value of alphabet[0] to 4, and then you need only one condition cases < 3 in the while loop.

This should be fairly efficient - it checks the minimum number of characters necessary. This assumes a bias towards lower-case characters, so checking for lower-case first should be slightly more efficient:
#include <ctype.h>
int ismixed( const unsigned char *str )
{
int hasUpper = 0;
int hasLower = 0;
while ( *str )
{
// can't be both upper and lower case
// but it can be neither
if ( islower( *str ) )
{
hasLower = 1;
}
else if ( isupper( *str ) )
{
hasUpper = 1;
}
// return true as soon as we hit
// both upper and lower case
if ( hasLower && hasUpper )
{
return( 1 );
}
str++;
}
return( 0 );
}
Depending on whether your input is biased to lower or upper case, checking isupper() first might be better.

If we assume ASCII
If we assume all alpha,
Then code only needs to count the "case" bits. Is the sum 0, same as string length or otherwise?
void test_case(const char *s) {
const char *start = s;
size_t sum = 0;
size_t mask = 'A' ^ 'a';
while (*s) {
sum += *s++ & mask;
}
ptrdiff_t len = s - start;
sum /= mask;
if (len == 0) puts("Empty string");
else if (sum == 0) puts("All UC");
else if (sum == len) puts("All LC");
else puts("Mixed");
}
Note: with slight mods, will work for EBCIDIC too.

Is said string guaranteed to only contain letters? If so, could check to see if any two consecutive characters are different cases.
#include <ctype.h>
#include <errno.h>
int mixed_case(const char *str) {
if(!str){
// sanity check
errno = EINVAL;
return -1;
}
// can't be mixed-case without more than one letter
if(str[0] == '\0' || str[1] == '\0'){
return 0;
}
for(int i = 1; str[i] != '\0' ; ++i) {
if (!islower(str[i]) ^ !islower(str[i-1])) {
// if two letter next to each other are not the same case, it's mixed case
return 1;
}
}
// didn't find any mismatches, so not mixed case
return 0;
}
Taking a similar approach, but instead of checking consecutive characters, it will find the first alphabetical character and check it against any other alphabetical characters found. This should be able to handle strings with non-alphabetical characters.
int mixed_case(const char *str) {
if(!str){
// sanity check
errno = EINVAL;
return -1;
}
// can't be mixed-case without more than one letter
if(str[0] == '\0' || str[1] == '\0'){
return 0;
}
// find the first alphabetical character and store its index at 'i'
int i = 0;
for(;!isalpha(str[i]) || str[i] == '\0'; ++i);
if(str[i] == '\0') {
// no alphabetical characters means you can't have mixed cases
return 0;
}
// See if any of the other alphabetical characters differ from the case of the first one
for(int j = i+1; str[j] != '\0' ; ++j) {
if(isalpha(str[j]) && (!islower(str[i]) ^ !islower(str[j]))) {
return 1;
}
}
// didn't find any mismatches, so not mixed case
return 0;
}

Another approach that does not assume ASCII nor all alpha.
Assess the first char and then perform one of 2 optimized loops.
This quits the loops on the first mis-match. Since the while() loops are only doing a single test, this leads to optimal performance.
#include <ctype.h>
void case_test(const char *s) {
if (*s == '\0') {
puts("Empty string");
return;
}
unsigned char *us = (unsigned char *)s; // use unsigned char with is***() functions.
if (islower(*us)) {
while (islower(*us)) {
us++;
}
if (*us) {
puts("Mixed or not alpha");
} else {
puts("All lower");
}
} else if (isupper(*us)) {
while (isupper(*us)) {
us++;
}
if (*us) {
puts("Mixed case or not alpha");
} else {
puts("All upper");
}
} else {
puts("Not alpha");
}
}
OP added cases including non-alpha. The below promptly handles that.
void case_test_with_non_letters(const char *s) {
unsigned char *us = (unsigned char *)s; // use unsigned char with is***() functions.
// Find first alpha or null character
while (!isalpha(*us) && *us) {
us++;
}
if (*us == '\0') {
puts("Empty string");
return;
}
if (islower(*us)) {
while (!isupper(*us) && *us) {
us++;
}
if (isupper(*us)) {
puts("Mixed");
} else {
puts("All letters lower");
}
} else if (isupper(*us)) {
while (!islower(*us) && *us) {
us++;
}
if (*us) {
puts("Mixed case");
} else {
puts("All letters upper");
}
} else {
puts("Not alpha");
}
}

97 = a = 1100001
65 = A = 0100001
You have just to test the bit number 6.

Related

Function to allow for only alphabets, hyphens and apostrophes

As stated above, I would like to make a function that checks if all the characters in a string contains any prohibited input. The condition is that I only want to accept alphabets, hyphens and apostrophes. Below is my code which does not work the way I intended it to. If it is not an alphabet AND not an apostrophe or a hyphen, I want to change result to 0. However, when I enter a valid input like 'a-a; which is either an alphabet or hyphen, the if function still gets executed which prints "IT IS NOT ACCEPTED".
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int validateInput(char word[]);
int main(void) {
char word[33] = "a-a";
printf("%d",validateInput(word));
}
int validateInput(char word[]) {
int result = 1;
int i;
int length = strlen(word);
for (i = 0; i <length; i++) {
if ((isalpha(word[i]) == 0) && ((word[i] != '-') || (word[i] != '\''))) {
printf("IT IS NOT ACCEPTED\n");
result = 0;
}
else {
printf("ACCEPTED\n");
}
}
return result;
}
There are multiple problems in your code:
you issue the diagnostic at each iteration instead of at the end of the loop
the test (word[i] != '-') || (word[i] != '\'') is always true.
isalpha() should not be passed a char value that could be negative. You should cast the argument as (unsigned char) to avoid potential undefined behavior.
Here is a modified version:
#include <ctype.h>
int validateInput(const char *word) {
int result = 1;
for (size_t i = 0; word[i] != '\0'; i++) {
if (!isalpha((unsigned char)word[i]) && word[i] != '-' && word[i] != '\'') {
result = 0;
break;
}
}
if (result) {
printf("ACCEPTED\n");
} else {
printf("IT IS NOT ACCEPTED\n");
}
return result;
}
Note however that the above function will accept an empty string, which might not be the intended behavior.
Here is a simpler version using sscanf() that works for ASCII:
#include <stdio.h>
int validateInput(const char *word) {
int pos = 0;
sscanf(word, "%*[-a-zA-Z']%n", &pos);
if (pos > 0 && word[pos] == '\0') {
printf("ACCEPTED\n");
return 1;
} else {
printf("IT IS NOT ACCEPTED\n");
return 0;
}
}
And this is a more verbose version using strspn() that works for all encodings:
#include <string.h>
int validateInput(const char *word) {
size_t len = strspn(word, "'-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
if (len > 0 && word[len] == '\0') {
printf("ACCEPTED\n");
return 1;
} else {
printf("IT IS NOT ACCEPTED\n");
return 0;
}
}
Try:
if( !( (isalpha((unsigned char)word[i])) || (word[i] == '-') || (word[i] == '\'')) )

Type checking arbitrary length array in ANSI C

Hi I am confined to stdio.h, stdlib.h and string.h and I need to ask a user for input - the input can be any number of characters between 1 and 6, however the first two characters MUST be an uppercase alphabetical letter, and the remaining four characters MUST be a number between 0 and 9.
Examples of valid input:
AB1
AB1234
AB
A
Examples of Invalid Input:
AB12345 (too many characters)
123 (first two characters are not uppercase letters)
ABA (a character after the second one is not a numeric value)
Here is my attempt so far (just bear in mind I have almost no experience with C, the likelihood that this solution is "idiomatic" is next to none, and the reason I am asking this is so that I can learn):
Flightcode is a char array defined as flightcode[7] it lives inside another struct called flight. I am fgetsing it into a temp_array[7] first and then strcpying it into the flight->flightcode such that the null terminator is appended and I don't know a better way of doing that.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FLIGHTCODE_LEN 6
#define MAX_CITYCODE_LEN 3
#define MAX_NUM_FLIGHTS 50
#define DB_NAME "database"
typedef struct {
int month;
int day;
int hour;
int minute;
} date_time_t;
typedef struct {
char flightcode[MAX_FLIGHTCODE_LEN + 1];
date_time_t departure_dt;
char arrival_city[MAX_CITYCODE_LEN + 1];
date_time_t arrival_dt;
} flight_t;
date_time_t departure_dt;
date_time_t arrival_dt;
char * scanline(char *dest, int dest_len);
int main(){
char temp_string[100];
flight_t flight[MAX_NUM_FLIGHTS + 1];
int correct_code = 0;
printf("Enter flight code>\n");
scanline(temp_string, sizeof(flight->flightcode));
strcpy(flight->flightcode, temp_string);
while(correct_code == 0)
{
for(int i = 0; flight->flightcode[i] != '\0' && correct_code == 0; i++)
{
while((i < 2 && (flight->flightcode[i] <= 64 || flight->flightcode[i] >= 91)) || (i > 1 && (flight->flightcode[i] < 48 || flight->flightcode[i] >= 58)))
{
printf("Invalid input.\n");
scanline(temp_string, sizeof(flight->flightcode));
strcpy(flight->flightcode, temp_string);
}
if((i < 2 && (flight->flightcode[i] > 64 || flight->flightcode[i] < 91)) || (i > 1 && (flight->flightcode[i] >= 48 || flight->flightcode[i] < 58)))
{
correct_code = 1;
}
}
}
}
char * scanline(char *dest, int dest_len){
int i, ch;
i = 0;
for (ch = getchar();
ch != '\n' && ch != EOF && i < dest_len -1; ch = getchar())
dest[i++] = ch;
dest[i] = '\0';
while (ch != '\n' && ch != EOF)
ch = getchar();
return (dest);
}
Scansets and the %n specifier could be used to parse the input.
The format string "%n%2[A-Z]%n%4[0-9]%n" uses the %n specifier in three places to capture the number of characters processed. The scanset %2[A-Z] will scan up to two characters if the characters are in the set of upper case letters. %4[0-9] will scan up to four characters if the characters are digits.
If two values are scanned by sscanf, the number of characters processed are subtracted to make sure there are two leading upper case characters and six or fewer total character and the trailing character is the terminating zero.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FLIGHTCODE_LEN 6
#define MAX_CITYCODE_LEN 3
#define MAX_NUM_FLIGHTS 50
#define DB_NAME "database"
typedef struct {
int month;
int day;
int hour;
int minute;
} date_time_t;
typedef struct {
char flightcode[MAX_FLIGHTCODE_LEN + 1];
date_time_t departure_dt;
char arrival_city[MAX_CITYCODE_LEN + 1];
date_time_t arrival_dt;
} flight_t;
date_time_t departure_dt;
date_time_t arrival_dt;
char * scanline(char *dest, int dest_len);
int main(){
int head = 0, leading = 0, tail = 0;
int correct_code = 0;
int result = 0;
char temp_string[100];
char upper[3] = "";
char digits[5] = "";
flight_t flight[MAX_NUM_FLIGHTS + 1];
do {
printf("Enter flight code>\n");
scanline(temp_string, sizeof(temp_string));
if ( 0 < ( result = sscanf ( temp_string, "%n%2[A-Z]%n%4[0-9]%n", &head, upper, &leading, digits, &tail))) {
if ( 1 == result && 0 == temp_string[leading]) {
correct_code = 1;
break;
}
if ( 2 == result && 2 == leading - head && 7 > tail - head && 0 == temp_string[tail]) {
correct_code = 1;
}
else {
printf ( "invalid input\n");
}
}
else {
printf ( "invalid input\n");
}
} while(correct_code == 0);
printf ( "Input is: %s\n", temp_string);
strcpy(flight->flightcode, temp_string);
return 0;
}
char * scanline(char *dest, int dest_len){
int i, ch;
i = 0;
for (ch = getchar(); ch != '\n' && ch != EOF && i < dest_len -1; ch = getchar()) {
dest[i++] = ch;
}
dest[i] = '\0';
while (ch != '\n' && ch != EOF) {
ch = getchar();
}
return dest;
}
Your function scanline does not do much more than the standard function fgets. I propose to use the standard function instead. Removing the trailing newline '\n' is easy.
I have split the checks into 3 parts:
Check the length to be more than 0 and not more than MAX_FLIGHTCODE_LEN.
Check the first 2 characters to be uppercase letters A..Z
Check the remaining characters to be digits 0..9
Proposed code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_FLIGHTCODE_LEN 6
#define MAX_CITYCODE_LEN 3
#define MAX_NUM_FLIGHTS 50
#define DB_NAME "database"
typedef struct {
int month;
int day;
int hour;
int minute;
} date_time_t;
typedef struct {
char flightcode[MAX_FLIGHTCODE_LEN + 1];
date_time_t departure_dt;
char arrival_city[MAX_CITYCODE_LEN + 1];
date_time_t arrival_dt;
} flight_t;
date_time_t departure_dt;
date_time_t arrival_dt;
int main(void){
char temp_string[100];
flight_t flight[MAX_NUM_FLIGHTS + 1];
int correct_code;
size_t len;
int i;
do
{
/* we first assume the code is correct and set this to 0 on any error */
correct_code = 1;
printf("Enter flight code>\n");
if(fgets(temp_string, sizeof(temp_string), stdin) == NULL)
{
if(feof(stdin)) fprintf(stderr, "no input (EOF)\n");
else perror("fgets");
correct_code = 0;
temp_string[0] = '\0';
}
if(correct_code)
{
len = strlen(temp_string);
/* cut off newline
* Use a loop to handle CR and LF just in case Windows might leave more than one character */
while((len > 0) &&
((temp_string[len - 1] == '\n') ||
(temp_string[len - 1] == '\r')))
{
len--;
temp_string[len] == '\0';
}
if(len > MAX_FLIGHTCODE_LEN)
{
correct_code = 0;
fprintf(stderr, "Input must not be longer than %d characters.\n", MAX_FLIGHTCODE_LEN);
}
if(len == 0)
{
correct_code = 0;
fprintf(stderr, "Empty input.\n");
}
}
/* check first two letters */
for(i = 0; (i < 2) && (i < len) && correct_code; i++)
{
/* you could use function isupper when you make sure the locale is set to "C" */
if((temp_string[i] < 'A') || (temp_string[i] > 'Z'))
{
correct_code = 0;
fprintf(stderr, "first two characters must be uppercase letters. Found '%c' at position %d\n", temp_string[i], i);
}
}
/* check digits starting from 3rd character */
for(i = 2; (i < MAX_FLIGHTCODE_LEN) && (i < len) && correct_code; i++)
{
/* you could use function isdigit here */
if((temp_string[i] < '0') || (temp_string[i] > '9'))
{
correct_code = 0;
fprintf(stderr, "Third to last characters must be digits. Found '%c' at position %d\n", temp_string[i], i);
}
}
if(correct_code)
{
/* we already checked that length is not more than MAX_FLIGHTCODE_LEN, so we don't need strncpy to avoid buffer overflow */
strcpy(flight->flightcode, temp_string);
printf("Valid code: %s\n", flight->flightcode);
}
else
{
fprintf(stderr, "Invalid code.\n");
}
} while(!correct_code);
return 0;
}
You have a requirement that does not fit well with what scanf can easily do, so I would stay away from it, and use fgets as a primary read utility.
But as the number of acceptable uppercase and digit characters is not fixed by only limited I would use a custom parser based on a state machine. It is probably not the most elegant nor efficient way but it is simple, robust and easy to maintain.
Just to demonstrate it, I have allowed blank characters before the first uppercase one and spaces after the last digit. So the following code accept an arbitrary long line following this regex pattern [ \t]*[A-Z]{1,maxupper}[0-9]{0,maxdigit}\s* provided it receives a buffer of size at least maxupper+maxupper+1. It returns a pointer to the buffer is successful or NULL if not.
As you have said that you could not use the ctype macros, I have defined ASCII (or any charset derived from ASCII) equivalent for the ones I have used.
#define TRUE 1
#define FALSE 0
inline int isupper(int c) {
return c >= 'A' && c <= 'Z'; // only for ASCII and derived
}
inline int isdigit(char c) {
return c >= '0' && c <= '9'; // guarantee per standard
}
inline int isblank(int c) {
return c == ' ' || c == '\t';
}
inline int isspace(int c) {
static const char spaces[] = " \t\r\n\v";
for(const char *s=spaces; *s != '\0'; s++) {
if (c == *s) return TRUE;
}
return FALSE;
}
char *get_string(char *buffer, int maxupper, int maxdigit, FILE *fd) {
char buf[16]; // any size >=2 will fit
char *cur = buffer;
int state = 0, uppersize=0, digitsize=0;
for (;;) { // allow lines longer than buf
if (NULL == fgets(buf, sizeof(buf), fd)) {
*cur = '\0'; // EOF: do not forget the terminating NULL
return state >= 1 ? buffer : NULL; // must have at least 1 char
}
for (char *b=buf; *b!='\0'; b++) {
switch(state) {
case 0: // spaces before first uppercase
if (isblank(*b)) break;
state++;
case 1: // first uppercase
if (! isupper(*b)) {
state = 5; // must read up to \n
break;
}
state++;
case 2: // process uppercase chars
if (! isupper(*b)) {
if (uppersize > 0) state++;
else {
state = 5; // must read up to \n
break;
}
}
else {
if (uppersize >= maxupper) {
state = 5; // must read up to \n
break;
}
*cur++ = *b;
uppersize++;
break;
}
case 3: // process digit chars
if (! isdigit(*b)) {
state++;
}
else {
if (digitsize >= maxdigit) {
state = 5; // must read up to \n
break;
}
*cur++ = *b;
digitsize++;
break;
}
case 4: // allow spaces after last digit
if ('\n' == *b) {
*cur = '\0';
return buffer;
}
if (! isspace(*b)) state++
break;
case 5: // on error clean end of line
if ('\n' == *b) return NULL;
}
}
}
}
Then in your code, you simply calls it that way:
...
printf("Enter flight code>\n");
if (NULL == get_string(flight->flightcode, 2, 4, stdin)) {
// process the error
...
}
...
First thing, realize that your question text is missing a question. Moreover, your question title makes no sense.
Anyway, here it is a possible, purposely very ugly, solution. Approach: you want to do X, so you write the code to do X. Let's start with scanline():
int scanline(char *dest, int dest_len)
{
int i = 0;
int ch;
while (1) {
// Read
ch = fgetc(stdin);
// Check
if (ch == EOF)
break;
if (ch == '\n')
break;
if (i >= dest_len - 1)
break;
// Use
dest[i] = ch;
++i;
}
dest[i] = 0;
// Is the string finished? Ok!
if (ch == '\n' || ch == EOF)
return 1;
// Otherwise discard the rest of the line. Not ok!
while (ch != '\n' && ch != EOF)
ch = fgetc(stdin);
return 0;
}
I know this is ugly, but I believe that it is helpful to clarify the three steps involved in file input: read, check, use. Note that it returns true if the line was up to the required number of characters (one less than the buffer size to accomodate for the terminator.
Then you want to check if:
scanline() is successful
there is at least one character.
character 0 is between 'A' and 'Z'
character 1 is between 'A' and 'Z'
character 2 is between '0' and '1'
character 3 is between '0' and '1'
character 4 is between '0' and '1'
character 5 is between '0' and '1'
Lets write the code for that:
int main(void)
{
flight_t flight;
while (1) {
printf("Enter flight code>\n");
if (!scanline(flight.flightcode, sizeof(flight.flightcode))) {
printf("Too many characters.\n");
continue;
}
int i = 0;
if (flight.flightcode[i] == 0) {
printf("Empty input.\n");
continue;
}
if (flight.flightcode[i] < 'A' || flight.flightcode[i] > 'Z') {
printf("Character %d is not upper case.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
if (flight.flightcode[i] < 'A' || flight.flightcode[i] > 'Z') {
printf("Character %d is not upper case.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
if (flight.flightcode[i] < '0' || flight.flightcode[i] > '9') {
printf("Character %d is not a digit.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
if (flight.flightcode[i] < '0' || flight.flightcode[i] > '9') {
printf("Character %d is not a digit.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
if (flight.flightcode[i] < '0' || flight.flightcode[i] > '9') {
printf("Character %d is not a digit.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
if (flight.flightcode[i] < '0' || flight.flightcode[i] > '9') {
printf("Character %d is not a digit.\n", i);
continue;
}
i++;
if (flight.flightcode[i] == 0)
break;
}
}
Some remarks:
in your code you set correct_code to 1 as soon as the first character was ok. If you want to loop through the characters you must check if there is an error and exit the loop.
don't use ASCII codes when you have the specific character literals available.
I suggest that you take my solution and, as an exercise fix it to be able to work with arbitrary MAX_FLIGHTCODE_LEN, and possibly with arbitrary number of letters and numbers. Of course MAX_FLIGHTCODE_LEN shall be equal to their sum!
Drop the useless requirement for not using <ctype.h>, and use also <stdbool.h>, which makes the programmer intention clearer.

Count word in string

I'm trying to write a function to count occurrences of a particular word in a string. For example: Given string -
"Stop, time to go home. Todo fix me."
Letters "to" appeared three times (twice in different words); however, the word "to" appears only once. What should I do to count only word "to" (if will appear in string more times then count every single one). Any advice?
This is the code I was trying and playing around.
int word(char inputLine[]) {
int word = 0, i = 0, j = 0;
for (i = 0; inputLine[i] != '\0'; i++) {
if (inputLine[i] == 't' || inputLine[i] == 'o' || inputLine[i] != ' ') {
word++;
}
}
return word;
}
Try this:
int word(char inputLine[]) {
int word = 0, i = 0;
// stop before the last char
for (i = 0; inputLine[i] != '\0' && inputLine[i+1] != '\0'; i++) {
// is (T or t) and (O or o)
if ((inputLine[i] == 't' || inputLine[i] == 'T') && (inputLine[i+1] == 'o' || inputLine[i+1] == 'O')) {
// after the 'to' is not a letter
if ((inputLine[i+2] < 'a' || inputLine[i+2] > 'z') &&
(inputLine[i+2] < 'A' || inputLine[i+2] > 'Z')) {
// before is not a letter (or this is the start of the string)
if (i == 0 ||
((inputLine[i-1] < 'a' || inputLine[i-1] > 'z') &&
(inputLine[i-1] < 'A' || inputLine[i-1] > 'Z'))) {
word++;
}
}
}
}
return word;
}
The simplest way would be to use strtok. But, if you'd like to do it all by hand, the following will work. Although you only wanted the "to" this will work for any search string:
#include <stdio.h>
// word -- get number of string matches
int
word(char *input,char *str)
// input -- input buffer
// str -- string to search for within input
{
int chr;
int prev;
int off;
int stopflg;
int wordcnt;
off = -1;
stopflg = 0;
wordcnt = 0;
prev = 0;
for (chr = *input++; ! stopflg; prev = chr, chr = *input++) {
// we've hit the end of the buffer
stopflg = (chr == 0);
// convert whitespace characters to EOS [similar to what strtok might
// do]
switch (chr) {
case ' ':
case '\t':
case '\n':
case '\r':
chr = 0;
break;
}
++off;
// reset on mismatch
// NOTE: we _do_ compare EOS chars here
if (str[off] != chr) {
off = -1;
continue;
}
// we just matched
// if we're starting the word we must ensure we're not in the middle
// of one
if ((off == 0) && (prev != 0)) {
off = -1;
continue;
}
// at the end of a word -- got a match
if (chr == 0) {
++wordcnt;
off = -1;
continue;
}
}
return wordcnt;
}
void
tryout(int expcnt,char *buf)
{
int actcnt;
actcnt = word(buf,"to");
printf("%d/%d -- '%s'\n",expcnt,actcnt,buf);
}
// main -- main program
int
main(int argc,char **argv)
{
char *cp;
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
cp = *argv;
if (*cp != '-')
break;
switch (cp[1]) {
default:
break;
}
}
tryout(1,"to");
tryout(2,"to to");
tryout(1," to ");
tryout(1,"todo to");
tryout(2,"todo to to");
tryout(2,"doto to to");
tryout(1,"doto to doto");
tryout(0,"doto");
return 0;
}
If you must use only "basic" C functions the above solutions seems ok, but in the case you want to build a more scalable application (and you want to solve the problem in a smarter way) you can use a library that manipulate regular expressions. You can check this answer: Regular expressions in C: examples?
Regexes has the advantage that you can make the regex case unsensible (That is one of your issues).
I usually use pcre because it has the regex style of perl and java.
Here it is a very useful example that uses pcre: http://www.mitchr.me/SS/exampleCode/AUPG/pcre_example.c.html
Let's posit these rules:
"to" can be a word only when there is no char before and after it except the space char
If you accept those rules as valid and correct you need to check 4 conditions:
if (str[i]=='t'&& str[i+1]=='o'&& str[i-1]!='a-z'&& str[i+2]!='a-z'){
word++;
}
Two more conditions can be included to check for the upper case letters.
public class FindCountOfWordInString {
public static void main(String[] args) {
String str = "yhing ghingu jhhtring inghfg ajklingingd me";
String find = "ing";
int count = findCountOfWordInString(str, find);
System.out.println(count);
}
private static int findCountOfWordInString(String str, String find) {
String[] strArr = str.split(" ");
int count = 0, k = 0;
for (int i = 0; i < strArr.length; i++) {
if (strArr[i].contains(find)) {
String strCheck = strArr[i];
char[] findCharArr = find.toCharArray();
for (int j = 0; j < strCheck.length(); j++) {
if (strCheck.charAt(j) == findCharArr[k]) {
k++;
if (k == 3) {
count++;
k = 0;
}
} else {
k = 0;
}
}
}
}
return count;
}
}

Replace space with newline

I'm trying to do a space with line replacer for Ti-89 calculators so that I can print lines without them being cut because of the horizontal character length. They would normally look like so
This is a pretty long test
ing for the Ti89 Calculator
and I would like them to look like so
This is a pretty long
testing for the Ti89
Calculator
I tried to do it with this code
void _print_line(char* string)
{
int k = strlen(string);
if(k > 26)
{
int n = 0;
int c = 25;
while(n == 0)
{
if(string[c] == 32)
{
n = 1;
}
else
{
c--;
}
if(c <= 0)
{
n = 2;
}
}
if(n == 1)
{
string[c] == '\n';
}
}
printf("%s\n", string);
}
But it seems to just ignore it and keep printing it like the first example.
You are not inserting the carriage return.
Replace
string[c] == '\n';
With
string[c] = '\n';
As Till said, you are not inserting the carriage return. The line
string[c] == '\n';
needs to be
string[c] = '\n';
With the difference being one "equals" sign versus two.
The reason is because "==" is a conditional operator which evaluates to either true or false, while "=" is the assignment operator which sets a value to a variable.
There needs to be processed for the entire string.
Keep a record of the current output character position to check whether or not exceed the specified width when you output the next word.
like this:
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#define DISP_OUT stdout
#define DISP_WIDTH 26
int disp_one(char ch){
static int pos;
fputc(ch, DISP_OUT);
if(ch == '\n')
pos = 0;
else
++pos;
if(pos == DISP_WIDTH){
fputc('\n', DISP_OUT);
pos = 0;
}
return pos;
}
typedef enum word_break {
KEEP, BREAK
} WORD_BREAK;
void disp(const char *str, WORD_BREAK word_break){
static int pos;
switch(word_break){
case BREAK:
while(*str){
pos = disp_one(*str++);
}
break;
case KEEP:
while(*str){
if(isspace((unsigned char)*str)){
pos = disp_one(*str++);
continue;
}
const char *end = str;//end : word end (find delimiter)
while(*end && !isspace((unsigned char)*end))
++end;
int len = end - str;//length of next output word
if(pos + len >= DISP_WIDTH && len < DISP_WIDTH){
pos = disp_one('\n');
}
while(str < end){
pos = disp_one(*str++);
}
}
break;
}
}
int main(void){
char *text = "This is a pretty long testing for the Ti89 Calculator";
disp(text, BREAK);
disp("\n", BREAK);
disp(text, KEEP);
return 0;
}

checking if character is upper or lower case in alphanumeric

I have this C code. If I input a LOL123 it should display that it is uppercase. And lol123 it is in lowercase. How do I use isalpha in excluding non-numerical input when checking isupper or is lower?
#include <stdio.h>
#define SIZE 6
char input[50];
int my_isupper(char string[]);
int main(){
char input[] = "LOL123";
int m;
m= isupper(input);
if( m==1){
printf("%s is all uppercase.\n", input);
}else
printf("%s is not all uppercase.\n", input);
return 0;
}
int my_isupper(char string[]){
int a,d;
for (a=0; a<SIZE); a++){
d= isupper(string[a]) ;
}
if(d != 0)
d=1;
return d;
}
For upper-case function just loop trough the string and if a lowercase character is encountred you return false like value. And don't use standard library functions names to name your own functions. Use isUpperCase instead.
Live Demo: https://eval.in/93429
#include <stdio.h>
#include <string.h>
int isUpperCase(const char *inputString);
int main(void)
{
char inputString1[] = "LOL123";
char inputString2[] = "lol123";
printf("%s is %s\n", inputString1, isUpperCase(inputString1)?"upper-case":"not upper-case");
printf("%s is %s\n", inputString2, isUpperCase(inputString2)?"lower-case":"not upper-case");
return 0;
}
int isUpperCase(const char *inputString)
{
int i;
int len = strlen(inputString);
for (i = 0; i < len; i++) {
if (inputString[i] >= 'a' && inputString[i] <= 'z') {
return 0;
}
}
return 1;
}
int my_isalpha_lower(int c) {
return ((c >= 'a' && c <= 'z')); }
int my_isalpha_upper(int c) {
return ((c >= 'A' && c <= 'Z')); }
int isdigit(int c) {
return (c >= '0' && c <= '9'); }
while (*s) {
if (!is_digit(*s) && !my_isalpha_lower(*s))
{
//isnot lower but is alpha
}
else if (!is_digit(*s) && !my_alpha_upper(*s))
{
//is not upper but is alpha
}
s++;
}
char c = ...;
if (isalpha(c))
{
// do stuff if it's alpha
} else {
// do stuff when not alpha
}
You have a lot to learn, besides using a name of a standard function your design also is completely flawed. You only memorize the case of the last character that you encounter in your for loop, so the result that you return is not at all what you think.
Some more observations:
Don't use the name of a standard function for your own.
Arrays decay to pointers when then are used as function parameters. You have no way to automatically detect the size of the array.
You expect your return from isupper to be a logical value. Testing that again with ==1 makes not much sense.
You have two different variables called input, one in file scope, one in main.
Fairly simple:
#include <ctype.h>
/**
* Will return true if there's at least one alpha character in
* the input string *and* all alpha characters are uppercase.
*/
int allUpper( const char *str )
{
int foundAlpha = 0;
int upper = 1;
for ( const char *p = str; *p; p++ )
{
int alpha = isalpha( *p );
foundAlpha = foundAlpha || alpha;
if ( alpha )
upper = upper && isupper( *p );
}
return foundAlpha && upper;
}

Resources