Suppose I have very long strings and I want to see if a column is allLower, allUpper, or mixedCase. For example with the following column
text
"hello"
"New"
"items"
"iTem12"
"-3nXy"
The text would be mixedCase. A naive algorithm to determine this might be:
// Naive single-pass case classification of one string.
// has_lower / has_upper record whether an ASCII lower-case or upper-case
// letter has been seen; the three result flags are derived from them after
// the scan.
int is_mixed_case, is_all_lower, is_all_upper;
int has_lower = 0;
int has_upper = 0;
// for each row...for each column...
// NOTE(review): `c` and `s` are not declared in this fragment; presumably
// `s` is the current NUL-terminated string and `c` a char - confirm.
for (int i = 0; (c=s[i]) != '\0'; i++) {
if (c >='a' && c <= 'z') {
has_lower = 1;
// both cases seen: the answer is already "mixed", stop scanning
if (has_upper) break;
}
else if (c >='A' && c <= 'Z') {
has_upper = 1;
// both cases seen: the answer is already "mixed", stop scanning
if (has_lower) break;
}
// any other character (digits, punctuation, ...) leaves both flags unchanged
}
// A string without any letters leaves all three results at 0.
is_all_lower = has_lower && !has_upper;
is_all_upper = has_upper && !has_lower;
is_mixed_case = has_lower && has_upper;
I'm sure there would be a more performant way to do this, however. What might be the most efficient way to do this algorithm/calculation?
If you know the character encoding that's going to be used (I've used ISO/IEC 8859-15 in the code example), a look-up table may be the fastest solution. This also allows you to decide which characters from the extended character set, such as µ or ß, you'll count as upper case, lower case or non-alphabetical.
/*
 * Classify the letter case of an ISO/IEC 8859-15 encoded, NUL-terminated
 * string using a 256-entry lookup table.
 *
 * Returns a bit mask: 0 = no letters, 1 = only upper case, 2 = only lower
 * case, 3 = mixed case.  Scanning stops as soon as both cases were seen.
 *
 * Table entries: 0 = not a letter, 1 = upper-case letter, 2 = lower-case
 * letter.  The rows for 0xA0-0xBF are aligned with the ISO 8859-15 code
 * chart (S at 0xA6, s-caron at 0xA8, Z-caron at 0xB4, micro sign at 0xB5,
 * ...), and the sharp s (0xDF) is counted as lower case.
 */
char test_case(const char *s) {
    static const char alphabet[256] = {
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x00
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x10
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x20
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x30
        0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x40  ABCDEFGHIJKLMNO
        1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, // 0x50 PQRSTUVWXYZ
        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0x60  abcdefghijklmno
        2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0, // 0x70 pqrstuvwxyz
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90
        0,0,0,0,0,0,1,0,2,0,2,0,0,0,0,0, // 0xA0: A6 Š, A8 š, AA ª
        0,0,0,0,1,2,0,0,2,0,2,0,1,2,1,0, // 0xB0: B4 Ž, B5 µ, B8 ž, BA º, BC Œ, BD œ, BE Ÿ
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xC0 ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ
        1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,2, // 0xD0 ÐÑÒÓÔÕÖ ØÙÚÛÜÝÞ, DF ß (lower case)
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xE0 àáâãäåæçèéêëìíîï
        2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2  // 0xF0 ðñòóôõö øùúûüýþÿ
    };
    char cases = 0;

    /* Index through unsigned char so bytes >= 0x80 never become negative. */
    while (*s && cases != 3) {
        cases |= alphabet[(unsigned char) *s++];
    }
    return cases; // 0 = none, 1 = upper, 2 = lower, 3 = mixed
}
As suggested in a comment by chux, you can set the value of alphabet[0] to 4, and then you need only one condition cases < 3 in the while loop.
This should be fairly efficient - it checks the minimum number of characters necessary. This assumes a bias towards lower-case characters, so checking for lower-case first should be slightly more efficient:
#include <ctype.h>
/*
 * Report whether a NUL-terminated string contains both a lower-case and an
 * upper-case character (as classified by islower()/isupper()).
 *
 * Returns 1 as soon as both kinds have been seen, 0 if the end of the
 * string is reached first.  Characters that are neither are ignored.
 */
int ismixed( const unsigned char *str )
{
    int sawLower = 0;
    int sawUpper = 0;

    for ( const unsigned char *p = str; *p != '\0'; ++p )
    {
        // lower case is tested first: input is assumed to be mostly lower case
        if ( islower( *p ) )
            sawLower = 1;
        else if ( isupper( *p ) )
            sawUpper = 1;
        else
            continue;   // not a cased character: flags unchanged, nothing to re-check

        if ( sawLower && sawUpper )
            return 1;
    }

    return 0;
}
Depending on whether your input is biased to lower or upper case, checking isupper() first might be better.
If we assume ASCII
If we assume all alpha,
Then code only needs to count the "case" bits. Is the sum 0, same as string length or otherwise?
/*
 * Print the case class of a string that is assumed to be ASCII and to
 * consist of letters only.
 *
 * The bit in which 'A' and 'a' differ is set for lower-case letters and
 * clear for upper-case ones, so counting the characters that have it set
 * and comparing the count with the string length tells the three cases
 * apart.  Prints "Empty string", "All UC", "All LC" or "Mixed".
 */
void test_case(const char *s) {
    const size_t case_bit = 'A' ^ 'a';
    size_t with_bit = 0;   /* characters whose case bit is set */
    size_t length = 0;

    for (; s[length] != '\0'; length++) {
        if (s[length] & case_bit) {
            with_bit++;
        }
    }

    if (length == 0) {
        puts("Empty string");
    } else if (with_bit == 0) {
        puts("All UC");
    } else if (with_bit == length) {
        puts("All LC");
    } else {
        puts("Mixed");
    }
}
Note: with slight modifications, this will work for EBCDIC too.
Is said string guaranteed to only contain letters? If so, could check to see if any two consecutive characters are different cases.
#include <ctype.h>
#include <errno.h>
int mixed_case(const char *str) {
if(!str){
// sanity check
errno = EINVAL;
return -1;
}
// can't be mixed-case without more than one letter
if(str[0] == '\0' || str[1] == '\0'){
return 0;
}
for(int i = 1; str[i] != '\0' ; ++i) {
if (!islower(str[i]) ^ !islower(str[i-1])) {
// if two letter next to each other are not the same case, it's mixed case
return 1;
}
}
// didn't find any mismatches, so not mixed case
return 0;
}
Taking a similar approach, but instead of checking consecutive characters, it will find the first alphabetical character and check it against any other alphabetical characters found. This should be able to handle strings with non-alphabetical characters.
int mixed_case(const char *str) {
if(!str){
// sanity check
errno = EINVAL;
return -1;
}
// can't be mixed-case without more than one letter
if(str[0] == '\0' || str[1] == '\0'){
return 0;
}
// find the first alphabetical character and store its index at 'i'
int i = 0;
for(;!isalpha(str[i]) || str[i] == '\0'; ++i);
if(str[i] == '\0') {
// no alphabetical characters means you can't have mixed cases
return 0;
}
// See if any of the other alphabetical characters differ from the case of the first one
for(int j = i+1; str[j] != '\0' ; ++j) {
if(isalpha(str[j]) && (!islower(str[i]) ^ !islower(str[j]))) {
return 1;
}
}
// didn't find any mismatches, so not mixed case
return 0;
}
Another approach that does not assume ASCII nor all alpha.
Assess the first char and then perform one of 2 optimized loops.
This quits the loops on the first mis-match. Since the while() loops are only doing a single test, this leads to optimal performance.
#include <ctype.h>
/*
 * Print the case class of a string, judged from its first character.
 *
 * If the first character is lower (upper) case, the leading run of
 * lower-case (upper-case) characters is skipped; reaching the terminator
 * means the whole string has that case, anything else is reported as
 * mixed/non-alpha.  A first character that is neither yields "Not alpha";
 * an empty string yields "Empty string".
 */
void case_test(const char *s) {
    // the is***() functions must be given unsigned char values
    unsigned char *p = (unsigned char *)s;

    if (*p == '\0') {
        puts("Empty string");
        return;
    }

    if (islower(*p)) {
        do {
            p++;
        } while (islower(*p));
        puts(*p != '\0' ? "Mixed or not alpha" : "All lower");
        return;
    }

    if (isupper(*p)) {
        do {
            p++;
        } while (isupper(*p));
        puts(*p != '\0' ? "Mixed case or not alpha" : "All upper");
        return;
    }

    puts("Not alpha");
}
OP added cases including non-alpha. The below promptly handles that.
/*
 * Print the case class of a string, ignoring non-alphabetic characters.
 *
 * The scan first advances to the first alphabetic character; if there is
 * none, "Empty string" is printed.  Depending on that character's case the
 * rest of the string is searched for a letter of the opposite case, and
 * the result ("Mixed"/"All letters lower" or "Mixed case"/"All letters
 * upper") is printed.
 */
void case_test_with_non_letters(const char *s) {
    // the is***() functions must be given unsigned char values
    unsigned char *p = (unsigned char *)s;

    // advance to the first alphabetic character or to the terminator
    for (; *p != '\0' && !isalpha(*p); p++) {
    }
    if (*p == '\0') {
        puts("Empty string");
        return;
    }

    if (islower(*p)) {
        // look for the first upper-case character
        for (; *p != '\0' && !isupper(*p); p++) {
        }
        puts(isupper(*p) ? "Mixed" : "All letters lower");
        return;
    }

    if (isupper(*p)) {
        // look for the first lower-case character
        for (; *p != '\0' && !islower(*p); p++) {
        }
        puts(*p != '\0' ? "Mixed case" : "All letters upper");
        return;
    }

    puts("Not alpha");
}
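97 = a = 1100001
65 = A = 1000001
You just have to test bit 5 (value 32 = 0x20, i.e. the sixth bit counting from 1): in ASCII it is set for lower-case letters and clear for upper-case letters.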
I basically have a sentence in a string and want to break it down word per word. Every word should go into an array of strings. I am not allowed to use strtok. I have this code but it doesn't work. Can someone help?
There is for sure something similar in the internet but I couldn't find anything...
// Attempt to split a sentence read from stdin into words stored in `array`.
// As written it does not work: see the notes on the two loops below.
int main(){
char s[10000]; // sentence
char array[100][100]; // array where I put every word
printf("Insert sentence: "); // receive the sentence
// gets() places no bound on the input length and can overflow `s`
gets(s);
int i = 0;
int j = 0;
// j walks over every character of the sentence, yet it is also used as the
// word index into `array`, which only has 100 rows
for(j = 0; s[j] != '\0'; j++){ // loop until I reach the end
// i restarts at 0 on every outer iteration, so the first word is copied
// again each time; the loop stops only on ' ' and does not test for '\0'
for(i = 0; s[i] != ' '; i++){ // loop until the word is over
array[j][i] = s[i]; // put every char in the array
}
// no '\0' is stored after the copied characters
}
return 0;
}
Every word should go into an array of strings. I am not allowed to use
strtok.
Interesting problem which could be resolved in a compact algorithm.
It handles multiple spaces and punctuation marks specified in check(char c).
The most difficult part of the problem is handling the corner cases properly. A word may be longer than WORD_LEN, or the number of words may exceed the capacity of the array.
Both cases are properly handled. The algorithm truncates the excessive words and parses only to the capacity of the array.
(BTW. Do not use gets: Why is the gets function so dangerous that it should not be used?)
Edit: The fully tested find_tokens function has been presented.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define WORD_LEN 3 // 100 // MAX WORD LEN
#define NR_OF_WORDS 3 // 100 // MAX NUMBER OF WORDS
#define INPUT_SIZE 10000
/*
 * Tell whether character c occurs in the delimiter set.
 *
 * delimiters: NUL-terminated string listing the delimiter characters.
 * Returns 1 if strchr() finds c in delimiters, 0 otherwise.  Because
 * strchr() also matches the terminating NUL, c == '\0' yields 1.
 */
int is_delimiter(const char * delimiters, char c) // check for a delimiter
{
    return strchr(delimiters, c) != NULL;
}
/*
 * Advance the scan index *i to the last character that precedes the next
 * token, so that the caller's own increment lands on the token's first
 * character (or on the terminating NUL).
 *
 * i:               in/out index into str; on entry str[*i] must not be the
 *                  terminating NUL.
 * str:             NUL-terminated string being tokenised.
 * skip_delimiters: non-zero - str[*i] is a delimiter; skip the rest of the
 *                  delimiter run.
 *                  zero     - str[*i] belongs to an over-long token; skip
 *                  the remaining token characters and then the delimiter
 *                  run that follows.
 * delimiters:      NUL-terminated set of delimiter characters.
 *
 * Returns 1 if more input follows str[*i], 0 if str[*i + 1] is the
 * terminating NUL.
 */
int skip(int *i, char *str, int skip_delimiters, const char *delimiters)
{
    if (!skip_delimiters) {
        // Discard the remainder of the over-long token.  The terminator is
        // tested explicitly: is_delimiter() reports '\0' as a delimiter,
        // and treating it as one would make the code below look at the
        // byte after the end of the string.
        while (str[*i] != '\0' && !is_delimiter(delimiters, str[*i]))
            (*i)++;

        if (str[*i] == '\0') {
            // The token ran to the end of the string: step back onto its
            // last character so the caller's increment reaches the NUL.
            (*i)--;
            return 0;
        }
    }

    // Consume the delimiter run, stopping on its last delimiter.
    while (str[(*i) + 1] != '\0' && is_delimiter(delimiters, str[(*i) + 1]))
        (*i)++;

    return str[(*i) + 1] != '\0';
}
/*
 * Split str into tokens separated by any of the characters in delimiters.
 *
 * max_tokens:   capacity of array (number of rows).
 * token_len:    maximum number of characters stored per token; each row of
 *               array has room for token_len characters plus the NUL.
 * str:          NUL-terminated input; it is only read.
 * array:        receives the tokens as NUL-terminated strings.
 * delimiters:   NUL-terminated set of delimiter characters.
 * nr_of_tokens: receives the number of tokens stored in array.
 *
 * Leading, trailing and repeated delimiters are ignored.  A token longer
 * than token_len is cut to its first token_len characters.
 *
 * Returns a bit mask: 0x01 if at least one stored token was truncated,
 * 0x02 if the input holds more than max_tokens tokens (the surplus is not
 * stored and not examined further), 0 if neither happened.
 */
int find_tokens(int max_tokens, int token_len, char *str, char array[][token_len+1], const char *delimiters, int *nr_of_tokens)
{
    int status = 0;  // all OK!
    int stored = 0;  // tokens written to array so far
    const char *p = str;

    *nr_of_tokens = 0;

    for (;;) {
        // Skip the delimiter run in front of the next token.  '\0' is
        // tested first because strchr() would match the terminator.
        while (*p != '\0' && strchr(delimiters, *p) != NULL)
            p++;
        if (*p == '\0')
            break;                      // no further token

        if (stored >= max_tokens) {
            status |= 0x02;             // more tokens than expected
            break;
        }

        // Copy the token; characters beyond token_len are dropped.
        int len = 0;
        while (*p != '\0' && strchr(delimiters, *p) == NULL) {
            if (len < token_len)
                array[stored][len++] = *p;
            else
                status |= 0x01;         // token has been truncated
            p++;
        }
        array[stored][len] = '\0';      // token termination
        stored++;
    }

    *nr_of_tokens = stored;
    return status;
}
/*
 * Demo driver: read one line from stdin, split it with find_tokens() and
 * print the token count, the status bits and every stored token.
 * Exits with EXIT_FAILURE if no line could be read.
 */
int main(void){
    char input[INPUT_SIZE+1];            // sentence
    char array[NR_OF_WORDS][WORD_LEN+1]; // array where I put every word, remember to include null terminator!!!
    int number_of_words;
    const char * delimiters = " .,;:\t"; // word delimiters

    printf("Insert sentence: ");         // receive the sentence
    // fgets() leaves the buffer untouched on EOF/error, so its result must
    // be checked before the buffer is parsed.
    if (fgets(input, sizeof input, stdin) == NULL) {
        fprintf(stderr, "No input\n");
        return EXIT_FAILURE;
    }
    input[strcspn(input, "\n")] = '\0';  // remove '\n'

    int ret = find_tokens(NR_OF_WORDS, WORD_LEN, input, array, delimiters, &number_of_words);
    printf("tokens= %d ret= %d\n", number_of_words, ret);
    for (int i = 0; i < number_of_words; i++)
        printf("%d: %s\n", i, array[i]);
    printf("End\n");
    return 0;
}
Test:
Insert sentence: ..........1234567,,,,,,abcdefgh....123::::::::::::
tokens= 3 ret= 1
0: 123
1: abc
2: 123
End
You are not '\0'-terminating the strings, and you are scanning the source from
the beginning every time you move on to the next character.
You only need one loop, and its condition must be s[i] != '\0':
int j = 0; // index for array
int k = 0; // index for array[j]
for(i = 0; s[i] != '\0'; ++i)
{
if(k == 99)
{
// word longer than array[j] can hold, aborting
array[j][99] = 0; // 0-terminating string
break;
}
if(j == 99)
{
// more words than array can hold, aborting
break;
}
if(s[i] == ' ')
{
array[j][k] = 0; // 0-terminating string
j++; // for the next entry in array
k = 0;
} else
array[j][k++] = s[i];
}
Note that this algorithm doesn't handle multiple spaces and punctuation marks.
This can be solved by using a variable that stores the last state.
int j = 0; // index for array
int k = 0; // index for array[j]
int sep_state = 0; // 0 normal mode, 1 separation mode
for(i = 0; s[i] != '\0'; ++i)
{
if(k == 99)
{
// word longer than array[j] can hold, aborting
array[j][99] = 0; // 0-terminating string
break;
}
if(j == 99)
{
// more words than array can hold, aborting
break;
}
// check for usual word separators
if(s[i] == ' ' || s[i] == '.' || s[i] == ',' || s[i] == ';' || s[i] == ':')
{
if(sep_state == 1)
continue; // skip multiple separators
array[j][k] = 0; // 0-terminating string
j++; // for the next entry in array
k = 0;
sep_state = 1; // enter separation mode
} else {
array[j][k++] = s[i];
sep_state = 0; // leave separation mode
}
}
As you can see, using the sep_state variable I'm able to check if multiple
separators come one after the other and skips subsequent separators. I also
check for common punctuation marks.
#include <stdio.h>
/*
 * Read one sentence from stdin, echo it and split it at single spaces into
 * the rows of `array` (one NUL-terminated word per row).
 * Words longer than a row are cut off; splitting stops at the last row.
 */
int main(void)
{
    char s[10000];        // sentence
    char array[100][100]; // array where i put every word

    printf("Insert sentence: "); // receive the sentence
    // fgets() bounds the read to the buffer size, unlike gets()
    if (fgets(s, sizeof s, stdin) == NULL)
        return 1;
    // fgets() keeps the line break; cut the sentence there
    for (int n = 0; s[n] != '\0'; n++) {
        if (s[n] == '\n') {
            s[n] = '\0';
            break;
        }
    }
    printf("%s", s);

    int i = 0; // current word
    int k = 0; // position inside the current word
    for (int j = 0; s[j] != '\0'; j++) { // loop until i reach the end
        if (s[j] != ' ') {
            if (k < 99)              // leave room for the terminator
                array[i][k++] = s[j];
        } else {
            array[i][k] = '\0';      // finish the current word
            if (i == 99)
                break;               // no row left for another word
            i++;
            k = 0;
        }
    }
    array[i][k] = '\0';              // finish the last word
    return 0;
}
Please note that the gets function is very unsafe and should never be used; use fgets (or scanf with a field width) instead.
I have been trying to figure out how to count the vowels and characters in each word of a sentence.
For example
In hello there sentence
hello : 5 characters, 2 vowels
there : 5 characters, 2 vowels. I have seen the code for doing the same thing for a full sentence. But not word by word.
Below is the coding I've been working on
// Attempt to print each space-separated word of `str` together with its
// vowel count.  The words are split correctly, but the vowel count is not
// (see the notes below).
int main() {
char str[512] = "hello there", word[256];
// v (the vowel counter) is declared without an initial value
int i = 0, j = 0, v, h;
// writes '\0' over the existing terminator, so the string is unchanged
str[strlen(str)] = '\0';
/* checking whether the input string is NULL */
if (str[0] == '\0') {
printf("Input string is NULL\n");
return 0;
}
/* printing words in the given string */
while (str[i] != '\0') {
/* ' ' is the separator to split words */
if (str[i] == ' ')
{
// word[] is scanned up to a '\0' here, but the terminator for the current
// word is only stored further down, after this loop
for (h = 0; word[h] != '\0'; ++h)
{
if (word[h] == 'a' || word[h] == 'e' || word[h] == 'i' || word[h] == 'o' || word[h] == 'u')++v;
}
// v is not reset after printing, so counts would carry over to later words
printf("\nVowels: %d", v);
word[j] = '\0';
printf("%s\n", word);
j = 0;
}
else
{
// append the current character to the word being collected
word[j++] = str[i];
}
i++;
}
word[j] = '\0';
/* printing last word in the input string */
// no vowel count is computed for this last word
printf("%s\n", word);
return 0;
}
The input will be all lower case. I'm having a hard time figuring this out.
While running the code I'm not getting the vowels count. I'm able to split the sentence. But vowel counting is not happening.
One fairly simple approach:
#include <stdio.h>
/*
 * Plural suffix for a count: the empty string for exactly 1, "s" for every
 * other value.
 */
const char* s(int n)
{
    if (n == 1)
        return "";
    return "s";
}
/*
 * Print, for every word of str, its length and its number of lower-case
 * vowels (a, e, i, o, u).
 *
 * A word ends at a space, tab, newline or at the end of the string; one
 * line is printed per separator encountered, so consecutive separators
 * produce zero-length words.
 */
void count (const char* str)
{
    int pos = 0;                    // index of the next character to read

    for (;;)
    {
        const int start = pos;      // index of the word's first character
        int vowels = 0;
        char c;

        // consume characters up to and including the word's separator
        for (;;)
        {
            c = str[pos++];
            if (c == ' ' || c == '\t' || c == '\n' || c == '\0')
                break;
            if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u')
                vowels++;
        }

        // pos is one past the separator, hence the -1
        const int len = pos - 1 - start;
        printf("'%.*s': %d character%s, %d vowel%s\n", len, str+start, len, s(len), vowels, s(vowels));

        if (c == '\0')
            return;                 // the separator was the end of the string
    }
}
int main(void)
{
count("My words with vowels");
return 0;
}
This sounds an awful lot like a homework assignment..
here's some pseudo-code <-- below will NOT run as is. Just to show logic.
int c = 0;
int v = 0;
for (int i = 0; i < lengthOfSentence; i++){
if (stringName[i] == '\0') { //optionally '\n' may be more suitable
return;
}
if (stringName[i] == ' '){
print previousWord // + c, v in whatever format you want
c = 0;
v = 0;
}
if (stringName[i] == vowel) { //you can do this part like in your code
word[v+c] = stringName[i]; //get current char and add to next slot
v++;
}
else {
word[v+c] = stringName[i];
c++;
}
beyond that it's minute details like realizing v+c will give you total word length when printing, etc..
Try this code. it might help you
#include<stdio.h>
/* Number of lower-case vowels (a, e, i, o, u) in a NUL-terminated word. */
static int count_vowels(const char *word)
{
    int vowels = 0;
    for (int h = 0; word[h] != '\0'; h++) {
        if (word[h] == 'a' || word[h] == 'e' || word[h] == 'i' || word[h] == 'o' || word[h] == 'u')
            vowels++;
    }
    return vowels;
}

/* Print one report line for a word of the given length. */
static void print_word_stats(const char *word, int length)
{
    printf("%s :", word);
    // length is an int, so %d is the matching conversion
    printf(" %d chracters,", length);
    printf(" %d Vowels.\n", count_vowels(word));
}

/*
 * Split a sentence at spaces and print, for every word, its number of
 * characters and of vowels.
 */
int main() {
    char str[512] = "hello there", word[256];
    int i = 0;
    int j = 0; // length of the word collected so far

    /* checking whether the input string is NULL */
    if (str[0] == '\0') {
        printf("Input string is NULL\n");
        return 0;
    }
    word[0] = '\0';
    /* printing words in the given string */
    while (str[i] != '\0') {
        /* ' ' is the separator to split words */
        if (str[i] == ' ') {
            print_word_stats(word, j);
            j = 0;
            word[j] = '\0';
        } else {
            // keep word terminated at all times so it can be printed as is
            word[j++] = str[i];
            word[j] = '\0';
        }
        i++;
    }
    /* the last word is not followed by a space, so report it after the loop */
    print_word_stats(word, j);
    return 0;
}
What you can probably do is, you can print the count for the characters and vowels when you encounter a " "(space) and then reset the counters. That way, you can find the characters and vowels for each word of the sentence.
If you understand the logic for doing this across a whole sentence, then you can also do it for single words by simply breaking the sentence into individual words and applying the same logic to each word. You can use the fact that words are separated by a space (or possibly several) to break the sentence down into words.