O(n) in -place algorithm for compressing "same letters pattern" - arrays

thats my first post here so I hope im following the rules.
I had a job interview few days ago and I got a problem which I coulnd solve (until now).
The idea is (was on C but I guess its not an isuue): given a char* of n length, you should use O(1) memory comp (in-place changes), and compress the patterns of the same char to "num+char" where num is the pattern length and char is the character of the pattern. you can assume the memory is long enough to contain the expected result. Expected time comp is O(n).
Example- [abbbccccdee] ---> [1a3b4c1d2e]
my main issue here was with the case of 1 long patterns, because "pushing forward" all letters to get the 1 before it, raising the complexity. Otherwise my idea was to work with a pointer that indicates the locations of the next compress pattern, and it works only as long as the pointer is behind the regular scanning of the array.
Thanks!

compressing “same letters pattern”
my main issue here was with the case of 1 long patterns, because "pushing forward" all letters to get the 1 before it, raising the complexity
Note the length needed in the first pass to keep O(n) and expand from the end.
2 passes.
Compress:
Walk the string from beginning counting character repetition:
-- If digit encountered, error out.
-- Replace any repeat of 2 or more characters with the decimal length as text followed by the character.
-- Count single character cases.
Note the length of the compressed data.
Expand:
Let source = &s[length] - 1
Let destination = &s[length + single] - 1
Walk the array from the end reading from source and then writing to destination:
-- With special detection of the the array start:
---- Detect a character lacking a preceding digit, then write to the destination as '1' + character.
---- Otherwise copy "digits + character" to destination. Copy these characters in reverse order so with "12z", copy 'z', '2', then '1'.
Insure algorithm handles "" case.
Similar to #dratenik

Let's put #chux amazing answer into real code! The following code:
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
char *compress_num_to_str(char *out, size_t v) {
// the standard output&reverse
char *tmp = out;
while (v) {
*tmp++ = v % 10 + '0';
v /= 10;
}
char *ret = tmp;
while (tmp > out) {
--tmp;
char a = *tmp;
*tmp = *out;
*out = a;
out++;
}
return ret;
}
char *compress(char *string) {
if (*string == '\0') {
return string;
}
// compress
char *in = string;
char *out = string;
size_t single = 0;
for (char c; (c = *in++) != '\0'; ) {
// the letter is going to be written anyway
char *const pos = in;
// iterate over repeated letters
while (c == *in) {
++in;
}
// if there are any repeated letters
if (pos != in) {
// output the count
const size_t count = in - pos + 1;
out = compress_num_to_str(out, count);
} else {
// a single char
single++;
}
*out++ = c;
}
const size_t length = out - string;
// expand
in = &string[length];
out = &string[length + single];
*out = '\0';
// I am not particularly happy at how this loops
// it's unreadable, but hey - it's fast and it works.
char c = *--in; // current character is buffered to be read
int cont; // should _the next_ loop continue
do {
// this should be a letter always
assert(!isdigit(c));
*--out = c;
// put the letter into destination
if ((cont = in != string) && (c = *--in, isdigit(c))) {
// copy up until digits
do {
*--out = c;
} while ((cont = in != string) && (c = *--in, isdigit(c)));
} else {
// the next input is not a digit
// add a one and repeat
*--out = '1';
}
} while (cont);
assert(in == string);
assert(out == string);
return string;
}
#define TEST(str) printf("%s -> %s\n", str, compress((char[sizeof(str) * 2 - 1]){str}))
int main() {
TEST("abbcccddddeef");
TEST("aabcdee");
TEST("");
TEST("a");
TEST("abcde");
TEST("aaaaaaaaaaaaaaaaaab");
TEST("aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");
TEST("abbbbbbbbbbbbbbbbbbbbbbbbbbbbbb");
}
outputs:
abbcccddddeef -> 1a2b3c4d2e1f
aabcdee -> 2a1b1c1d2e
->
a -> 1a
abcde -> 1a1b1c1d1e
aaaaaaaaaaaaaaaaaab -> 18a1b
aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb -> 18a374b
abbbbbbbbbbbbbbbbbbbbbbbbbbbbbb -> 1a30b

Related

unintended output character p when reversing a DNA string in C

The intended output is to first reverse the whole DNA string, and then convert A <-> T, C <-> G.
However, in the actual output, the first character prints as "p", which is coming out of nowhere, but the rest of the output string is fine.
Here's the code:
int main() {
const char dna[] = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT"
"TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG"
"GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT"
"CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA"
"AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT";
int dna_len = strlen(dna);
char rev_comp[dna_len+1];
char temp = '\0';
char temp_dna[dna_len+1];
for (int i = 0; i < dna_len + 1; i++) {
temp_dna[i] = dna[dna_len - i];
if (temp_dna[i] == 'A') {
temp = 'T';
rev_comp[i] = temp;
}
else if (temp_dna[i] == 'T') {
temp = 'A';
rev_comp[i] = temp;
}
else if (temp_dna[i] == 'C') {
temp = 'G';
rev_comp[i] = temp;
}
else if (temp_dna[i] == 'G') {
temp = 'C';
rev_comp[i] = temp;
}
}
rev_comp[dna_len+1] = '\0';
printf("original: %s\n", dna);
printf("rev_comp: %s\n", rev_comp);
return 0;
}
#TedLyngmo has already pointed out the indexing errors in your original code, but another consideration you may address is thinking about being able to reuse some of the code that you are writing in other programs later. Rather than writing specialized code over-and-over again for each individual program you write, identifying common parts of the code you may want to use again in another program and creating a short function for that part of the code makes that possible.
You will likely have the need to reverse a string more times than just in this program, so writing a reusable function to reverse a string that you can use wherever it is needed makes sense. Depending on your career path, you may also have the need to transform A <-> T, C <-> G more than in this one program, so a short function to do that may make sense as well.
Caveat: If upmost efficiency is required (dealing with billions of characters strings), then it would make sense to combine the operations and take advantage of a single iteration over the DNA sequence string. By working from each end of the string towards the middle you can handle two-characters per-iteration reducing by-half the number of iterations needed.
To make a reusable function for each the reversal and the transform of the string you can write the functions as follows. The string reversal function shows how to work from each end toward the middle requiring only half the number of iterations as the string has characters:
#include <stdio.h>
#include <string.h>
/* reverse src in dest copying 2-characters per-iteration. */
void strrev(char *dest, const char *src)
{
size_t begin = 0, end = strlen(src); /* begin and 1-past-end indexes */
dest[end] = 0; /* nul-terminate dest */
for(; begin < end--; ++begin) {
dest[begin] = src[end]; /* end to begin */
dest[end] = src[begin]; /* begin to end */
}
}
/* transform A <-> T, C <-> G */
void xformATCG (char *s)
{
do {
if (*s == 'A')
*s = 'T';
else if (*s == 'T')
*s = 'A';
else if (*s == 'C')
*s = 'G';
else if (*s == 'G')
*s = 'C';
} while (*s++);
}
If you like, you can write a simple print function that will break long lines of output at a specific number of characters similar to how you show with your initialization of dna[]. For what it's worth you could add:
/* simple print with break at brk chars function */
void prnwbrk (const char *s, size_t brk)
{
size_t n = 0; /* counter */
while (s[n]) { /* loop until end-of-string */
if (n && n % brk == 0) /* if brk chars, output \n */
putchar ('\n');
putchar (s[n++]); /* output char */
}
putchar ('\n'); /* final \n */
}
Now reversing and transforming the string simply becomes a matter of calling strrev() and xformATCG() in main(). You can output between each operation to check each step (which makes debugging a bit easier). A short main() could be:
int main (void) {
const char dna[] = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT"
"TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG"
"GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT"
"CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA"
"AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT";
char rev_comp[sizeof dna];
prnwbrk (dna, 50); /* print original dna */
putchar ('\n');
strrev (rev_comp, dna); /* reverse and print */
prnwbrk (rev_comp, 50);
putchar ('\n');
xformATCG (rev_comp); /* transform chars and print */
prnwbrk (rev_comp, 50);
}
Example Use/Output
If I understood your question and the operations properly, the reversed and transformed strings would look like:
$ ./bin/revdna
GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT
TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG
GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT
CTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACCTACTA
AAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAAT
TAAGTTAACAATAATAATACAGGATGTTCGTAATTAATTAATTGTGTGAA
ATCATCCATACAAGCGGACATTATAACTTGCATCCACGCTATTTATTATC
TTACTCCGTCCTTAGTTTCTGTCTATGACGCTGTATCCCACGAGGCCGAG
GTCGCAGAGCGTTACGATAGCGCACGTGTGGGGGGTCTGCTTTTATGGTT
TACGTACCTCTCGAGGGCACTCACCAATTATCCCACTATCTGGACACTAG
ATTCAATTGTTATTATTATGTCCTACAAGCATTAATTAATTAACACACTT
TAGTAGGTATGTTCGCCTGTAATATTGAACGTAGGTGCGATAAATAATAG
AATGAGGCAGGAATCAAAGACAGATACTGCGACATAGGGTGCTCCGGCTC
CAGCGTCTCGCAATGCTATCGCGTGCACACCCCCCAGACGAAAATACCAA
ATGCATGGAGAGCTCCCGTGAGTGGTTAATAGGGTGATAGACCTGTGATC
Nothing wrong with doing it all in main(), but thinking ahead can save you from reinventing-the-wheel each time you need to do the same thing in another program. (additionally, writing and debugging a function once, prevents new bugs from slipping in when you reinvent the function later)
Look things over and let me know if you have further questions.
Your loop is wrong and loops up to dna_len (where the null terminator is). It should be:
for (int i = 0; i < dna_len; i++) { // corrected loop
temp_dna[i] = dna[dna_len - i - 1]; // corrected calculation
Also, the final null terminator in rev_comp should be assigned at index dna_len, not dna_len + 1 - which is out of bounds so your program has undefined behavior. Printing p is one possible outcome of undefined behavior.
rev_comp[dna_len] = '\0';
Demo
You can make a small helper function to just do the reversing before you start swapping characters in the string though. Making small dedicated functions that does one thing only is good for debugging your program later. It's then easier to isolate and find problems. Example:
void rev(const char *in, size_t len, char *out) {
for(size_t i = 0; i < len; ++i) {
out[len - i - 1] = in[i];
}
out[len] = '\0';
}
And call it with
rev(dna, dna_len, rev_comp);
before swapping the letters:
for (size_t i = 0; i < dna_len; ++i) {
switch(rev_comp[i]) {
case 'A': rev_comp[i] = 'T'; break;
case 'T': rev_comp[i] = 'A'; break;
case 'G': rev_comp[i] = 'C'; break;
case 'C': rev_comp[i] = 'G'; break;
}
}
Demo

How to copy a sentence from a longer string into a new array while including period?

I want to save part of a string into a new char array while including the period. For example, the string is:
My name is John. I have 1 dog.
I want to copy each char up to and including the first period, so the new char array will contain:
My name is John.
The code I have written below copies only "My name is John" but omits the period.
ptrBeg and ptrEnd point to the char at the beginning and end, respectively, of the portion I want to copy. My intention was to copy ptrBeg into array newBuf through a pointer to newBuf and then increment both ptrBeg and the pointer to the array until ptrBeg and ptrEnd point to the same char, which should always be a period.
At this point, the text of the string should be copied, so I increment the pointer to char array once more and copy the period to the new space using
++ptrnewBuf;
*ptrnewBuf = *ptrEnd";
Finally, I print the contents of newBuf.
Here's the total code:
int main()
{
char buf[] = "My name is John. I have 1 dog.";
char * ptrBuf;
char * ptrBeg;
char * ptrEnd;
ptrBeg = buf;
ptrBuf = ptrBeg;
while (*ptrBuf != '.'){
ptrBuf++;
}
ptrEnd = ptrBuf;
char newBuf[100];
char * ptrnewBuf = newBuf;
while(*ptrBeg != *ptrEnd){
*ptrnewBuf = *ptrBeg;
ptrnewBuf++;
ptrBeg++;
}
++ptrnewBuf;
*ptrnewBuf = *ptrEnd;
printf("%s", newBuf);
}
How would I modify this code to include a period?
You are on the right track, but you may be making things a bit more complicated than needed and overlooking a few critical checks. The key to iterating by pointers or using pointer arithmetic is to always validate and protect your array or memory bounds during each iteration or arithmetic operation.
Another tip is to always map out your pointer positions on a piece of paper before coding everything up so you have a clear picture of what your iteration limits and any adjustments need to be. (you don't have to use full long strings and many boxes, just use a representation of what needs to be done with a handful of characters) In your case where you wish to copy the substing up through the first '.', something simple like the following will do, e.g.
+---+---+---+---+---+---+
| A | . | | B | . |\0 |
+---+---+---+---+---+---+
^ ^
| pointer (when *p == '.')
buf
So to copy "A." from buf to a new buffer you can't simply iterate while (*p != '.') or you will not copy '.'. By drawing it out, you can clearly see you need to also copy the character when p == '.', e.g.
+---+---+---+---+---+---+
| A | . | | B | . |\0 |
+---+---+---+---+---+---+
^ ^
| |-->| pointer (p + 1)
buf
Now regardless of the actual length of the string before '.', you now know you need p + 1 as the final address to include the last character in the copy.
You also know how many characters your new buffer can store. Say the size of new is MAXC characters (maximum number of characters). So you can store a string of at most MAXC-1 characters (plus the nul-character). When you are filling new you need to always validate you are within MAXC-1 characters.
You also need to insure you new string is nul-terminated (or it isn't a string, it's simply an array of characters). One effective way to insure nul-termination is by initializing all characters in new to 0 when it is declared, e.g.
char new[MAXC] = "";
which initializes the 1st character to 0 (e.g. '\0' empty-string) and all remaining characters 0 by default. Now if you fill no more than MAXC-1 characters, you are guaranteed the array will be a nul-terminated string.
Putting it altogether, you could do something like the following:
#include <stdio.h>
#define MAXC 128 /* if you need a constant, #define one (or more) */
int main (void) {
char buf[] = "My name is John. I have 1 dog.",
*p = buf, /* pointer to buf */
new[MAXC] = "", /* buffer for substring */
*n = new; /* pointer to new */
size_t ndx = 0; /* index for new */
/* loop copying each char until new full, '.' copied, or end of buf */
for (; ndx + 1 < MAXC && *p; p++, n++, ndx++) {
*n = *p; /* copy char from buf to new */
if (*n == '.') /* if char was '.' break */
break;
}
printf ("buf: %s\nnew: %s\n", buf, new);
return 0;
}
(note: ndx is incremented as part of the for loop to track the number of characters copied with the pointers)
Example Use/Output
$ ./bin/str_cpy_substr
buf: My name is John. I have 1 dog.
new: My name is John.
If you do not have the luxury of initializing the string to insure nul-termination, you can always affirmatively nul-terminate after your copy is done. For example, you could add the following after the for loop exit to insure an array of unknown initialization is properly terminated:
*++n = 0; /* nul-terminate (if not already done by initialization) and
* note ++n applied before * due to C operator precedence.
*/
Look things over and let me know if you have further questions.
Just breaking it out into a helper function that "extracts" the first sentence from a line. Just copies the characters over one at a time until either an end of string condition is hit on the source, the period is found, or a max length of the destination buffer is encountered.
void ExtractFirstSentence(const char* line, char* dst, int size)
{
int count = 0;
char c ='\0';
if ((line == NULL) || (dst == NULL) || (size <= 0))
{
return;
}
while ((*line) && ((count+1) < size) && (c != '.'))
{
c = *line++;
*dst++ = c;
count++;
}
*dst = '\0';
}
int main()
{
char buf[] = "My name is John. I have 1 dog.";
char newBuf[100];
ExtractFirstSentence(buf, newBuf, 100);
printf("%s", newBuf);
}
if you want something a bit easier without dealing with all those pointers, try :
int main()
{
char buf[] = "My name is John. I have 1 dog.";
int i = 0;
int j = 0;
while(buf[i] != '.' && buf[i] != '\0') {
i++;
}
char newbuf[i+1];
while (j <= i) {
newbuf[j] = buf[j];
j++;
}
newbuf[j] = '\0';
printf("%s\n",newbuf);
return 0;
}
though the i+1 when making newbuf and the newbuff[j] = '\0' im not 100% certain need to be that way. my thoughts are the i+1 is needed to make room for the \0 ending which is then added after the while loop copying buf to newbuf. but i could be mistaken.
You can use strtok() to split string. Just type man strtok, You will see:
Program source
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int
main(int argc, char *argv[])
{
char *str1, *str2, *token, *subtoken;
char *saveptr1, *saveptr2;
int j;
if (argc != 4) {
fprintf(stderr, "Usage: %s string delim subdelim\n",
argv[0]);
exit(EXIT_FAILURE);
}
for (j = 1, str1 = argv[1]; ; j++, str1 = NULL) {
token = strtok_r(str1, argv[2], &saveptr1);
if (token == NULL)
break;
printf("%d: %s\n", j, token);
for (str2 = token; ; str2 = NULL) {
subtoken = strtok_r(str2, argv[3], &saveptr2);
if (subtoken == NULL)
break;
printf(" --> %s\n", subtoken);
}
}
exit(EXIT_SUCCESS);
}
An example of the output produced by this program is the following:
$ ./a.out 'a/bbb///cc;xxx:yyy:' ':;' '/'
1: a/bbb///cc
--> a
--> bbb
--> cc
2: xxx
--> xxx
3: yyy
--> yyy

Print out all possible strings in C using recursion

My assignment is that code takes a string as input, and if there are x's in the string replace them with either a 0 or a 1, and print out all the possible string combinations. We have to use recursion for this as well. For example, if the input string was "1x0X" the output would be:
1000
1001
1100
1101
I'm really struggling with how I'm supposed to find all the permutations of the string without having the complete string yet. I have a series of function that combine to print out all permutations of a list of numbers, but I don't know how to make a function where it only permutes certain elements of a list.
Does anyone have any suggestions on how to accomplish this?
Jonathan's initial suggestion
This code implements what I suggested in a comment essentially verbatim. It accepts either x or X as a valid marker because the examples in the question do too.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static void map_x(const char *str)
{
size_t xloc = strcspn(str, "xX");
if (str[xloc] == '\0')
printf("%s\n", str);
else
{
char *copy = strdup(str);
copy[xloc] = '0';
map_x(copy);
copy[xloc] = '1';
map_x(copy);
free(copy);
}
}
int main(void)
{
char buffer[4096];
while (fgets(buffer, sizeof(buffer), stdin) != 0)
{
buffer[strcspn(buffer, "\n")] = '\0';
map_x(buffer);
}
return 0;
}
The main() function is essentially the same in all three variants. The use of strcspn() is a standard idiom that trims everything from the first newline onwards, or overwrites the end of the string if there is no newline in it.
Note that this solution is safe even if a read-only string literal is passed to the function; it does not modify the string that it is passed. The following solutions will both crash or otherwise fail if the initial string is in fact a read-only string literal.
It would be possible to determine the string length, allocate a VLA (variable length array) to take the string copy, and copy the string into the VLA. That would dramatically reduce the cost of allocating memory for the string (VLA allocation is much simpler than a general purpose memory allocator).
Gene's Suggestion
This code implements what Gene suggested in a comment. It will be more efficient because it does no extra memory allocation, an expensive operation on most systems.
#include <stdio.h>
#include <string.h>
static void map_x(char *str)
{
size_t xloc = strcspn(str, "xX");
if (str[xloc] == '\0')
printf("%s\n", str);
else
{
char letter = str[xloc];
str[xloc] = '0';
map_x(str);
str[xloc] = '1';
map_x(str);
str[xloc] = letter;
}
}
int main(void)
{
char buffer[4096];
while (fgets(buffer, sizeof(buffer), stdin) != 0)
{
buffer[strcspn(buffer, "\n")] = '\0';
map_x(buffer);
}
return 0;
}
Mildly optimized variant
This optimizes the work by not rescanning the prefix that is already known to be free of x's.
/* SO 4764-4683 */
#include <stdio.h>
#include <string.h>
static void map_x(char *str, size_t offset)
{
size_t xloc = strcspn(&str[offset], "xX") + offset;
if (str[xloc] == '\0')
printf("%s\n", str);
else
{
char letter = str[xloc];
str[xloc] = '0';
map_x(str, xloc);
str[xloc] = '1';
map_x(str, xloc);
str[xloc] = letter;
}
}
int main(void)
{
char buffer[4096];
while (fgets(buffer, sizeof(buffer), stdin) != 0)
{
buffer[strcspn(buffer, "\n")] = '\0';
map_x(buffer, 0);
}
return 0;
}
The difference in performance is probably not measurable on almost any input simply because the I/O time will dominate.
With all due respect to chux, I think that the code in the answer is more complex than necessary. The extra data structure seems like overkill.
Recursion is best used when the recursive depth is limited and not too big.
Yet setting aside that axiom, below is a double recursive solution. Its eats up stack space quickly (that is its biggest constraint) and so is not a robust solution. Yet it gets the job.
For "code takes a string as input,", use fgets() - not shown.
Yet in the spirit of recursion why not recurse the input too? The print_combo() recursion produces a linked list (LL) of characters and keeps track of the number of 'x' read. Once an end-of-line/end-of-file occurs, it is time to print and the linked-list starts with the last character.
The foo() recursion prints the LL in reverse order, passing in a binary mask to direct the x substitution of 0 or 1. The unsigned binary mask is good for typically 32 x's. That is another restriction.
If you must, mouse over for the code.
typedef struct node {
const struct node *prev;
int ch;
} node;
// Print the line
void foo(const node *prev, unsigned mask) {
if (prev) {
if (prev->ch == 'x' || prev->ch == 'X') {
foo(prev->prev, mask >> 1);
putchar("01"[mask & 1]);
} else {
foo(prev->prev, mask);
putchar(prev->ch);
}
}
}
// Read, form the LL and then print
void print_combo(const node *prev, unsigned xcount) {
node n = {.prev = prev, .ch = getchar()};
if (n.ch == '\n' || n.ch == EOF) {
for (unsigned mask = 0; mask < (1u << xcount); mask++) {
foo(prev, mask);
putchar('\n');
}
} else {
print_combo(&n, xcount + (n.ch == 'x' || n.ch == 'X'));
}
}
int main(void) {
print_combo(NULL, 0);
}
Input
00x01x10x11
Output
00001010011
00001010111
00001110011
00001110111
00101010011
00101010111
00101110011
00101110111
I would do something a bit simpler. Just use a position parameter to iterate over the input string. Whenever you hit the 'x' character recurse twice, once for '0' and once for '1'. Make sure to reset the character back to 'x' after you return. Whenever you hit the digit character just recurse once. Increment the position parameter each time you recurse. When you hit the end of the string, print it out. With this idea, you'd get something like this:
#include <stdio.h>
void print_combo(char *str, int pos) {
char c;
c = str[pos];
switch (c) {
case '0':
case '1':
print_combo(str, pos+1);
break;
case 'x':
case 'X':
str[pos] = '0';
print_combo(str, pos+1);
str[pos] = '1';
print_combo(str, pos+1);
str[pos] = c;
break;
case '\0':
printf("%s\n", str);
break;
default:
printf("bad input\n");
break;
}
}
int main() {
char str[10];
strcpy(str, "1x0x");
printf("printing %s\n", str);
print_combo(str, 0);
strcpy(str, "0x01x");
printf("printing %s\n", str);
print_combo(str, 0);
strcpy(str, "0x01x0X1");
printf("printing %s\n", str);
print_combo(str, 0);
return 0;
}
My output looks like this:
printing 1x0x
1000
1001
1100
1101
printing 0x01x
00010
00011
01010
01011
printing 0x01x0X1
00010001
00010011
00011001
00011011
01010001
01010011
01011001
01011011

how to cut out a chinese words & english words mixture string in c language

I have a string that contains both Mandarin and English words in UTF-8:
char *str = "你a好测b试";
If you use strlen(str), it will return 14, because each Mandarin character uses three bytes, while each English character uses only one byte.
Now I want to copy the leftmost 4 characters ("你a好测"), and append "..." at the end, to give "你a好测...".
If the text were in a single-byte encoding, I could just write:
strncpy(buf, str, 4);
strcat(buf, "...");
But 4 characters in UTF-8 isn't necessarily 4 bytes. For this example, it will be 13 bytes: three each for 你, 好 and 测 and one for a. So, for this specific case, I would need
strncpy(buf, str, 13);
strcat(buf, "...");
If I had a wrong value for the length, I could produce a broken UTF-8 stream with an incomplete character. Obviously I want to avoid that.
How can I compute the right number of bytes to copy, corresponding to a given number of characters?
First you need to know your encoding. By the sound of it (3 byte Mandarin) your string is encoded with UTF-8.
What you need to do is convert the UTF-8 back to unicode code points (integers). You can then have an array of integers rather than bytes, so each element of the array will be 1 character, reguardless of the language.
You could also use a library of functions that already handle utf8 such as http://www.cprogramming.com/tutorial/utf8.c
http://www.cprogramming.com/tutorial/utf8.h
In particular this function: int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); might be very useful, it will create an array of integers, with each integer being 1 character. You can then modify the array as you see fit, then convert it back again with int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);
I would recommend dealing with this at a higher level of abstraction: either convert to wchar_t or use a UTF-8 library. But if you really want to do it at the byte level, you could count characters by skipping over the continuation bytes (which are of the form 10xxxxxx):
#include <stddef.h>
size_t count_bytes_for_chars(const char *s, int n)
{
const char *p = s;
n += 1; /* we're counting up to the start of the subsequent character */
while (*p && (n -= (*p & 0xc0) != 0x80))
++p;
return p-s;
}
Here's a demonstration of the above function:
#include <string.h>
#include <stdio.h>
int main()
{
const char *str = "你a好测b试";
char buf[50];
int truncate_at = 4;
size_t bytes = count_bytes_for_chars(str, truncate_at);
strncpy(buf, str, bytes);
strcpy(buf+bytes, "...");
printf("'%s' truncated to %d characters is '%s'\n", str, truncate_at, buf);
}
Output:
'你a好测b试' truncated to 4 characters is '你a好测...'
The Basic Multilingual Plane was designed to contain characters for almost all modern languages. In particular, it does contain Chinese.
So you just have to convert your UTF8 string to a UTF16 one to have each character using one single position. That means that you can just use a wchar_t array or even better a wstring to be allowed to use natively all string functions.
Starting with C++11, the <codecvt> header declares a dedicated converter std::codecvt_utf8 to specifically convert UTF8 narrow strings to wide Unicode ones. I must admit it is not very easy to use, but it should be enough here. Code could be like:
char str[] = "你a好测b试";
std::codecvt_utf8<wchar_t> cvt;
std::mbstate_t state = std::mbstate_t();
wchar_t wstr[sizeof(str)] = {0}; // there will be unused space at the end
const char *end;
wchar_t *wend;
auto cr = cvt.in(state, str, str+sizeof(str), end,
wstr, wstr+sizeof(str), wend);
*wend = 0;
Once you have the wstr wide string, you can convert it to a wstring and use all the C++ library tools, or if you prefer C strings you can use the ws... counterparts of the str... functions.
Pure C solution:
All UTF8 multibyte characters will be made from char-s with the most-significant-bit set to 1 with the first bits of their first character indicating how many characters makes a codepoint.
The question is ambiguous in regards to the criterion used in cutting; either:
a fixed number of codepoints followed by three dots, this wil require a variable size output buffer
a fixed size output buffer, which will impose "whatever you can fit inside"
Both the solutions will require a helper function telling how many chars make the next codepoint:
// Note: the function does NOT fully validate a
// UTF8 sequence, only looks at the first char in it
int codePointLen(const char* c) {
if(NULL==c) return -1;
if( (*c & 0xF8)==0xF0 ) return 4; // 4 ones and one 0
if( (*c & 0xF0)==0xE0 ) return 3; // 3 ones and one 0
if( (*c & 0xE0)==0xC0 ) return 2; // 2 ones and one 0
if( (*c & 0x7F)==*c ) return 1; // no ones on msb
return -2; // invalid UTF8 starting character
}
So, solution for the criterion 1 (fixed number of code points, variable output buff size) - does not append ... to the destination, but you can ask "how many chars I need" upfront and if it is longer than you can afford, reserve yourself the extra space.
// returns the number of chars used from the output
// If not enough space or the dest is null, does nothing
// and returns the lenght required for the output buffer
// Returns negative val if the source in not a valid UTF8
int copyFirstCodepoints(
int codepointsCount, const char* src,
char* dest, int destSize
) {
if(NULL==src) {
return -1;
}
// do a cold run to see if size of the output buffer can fit
// as many codepoints as required
const char* walker=src;
for(int cnvCount=0; cnvCount<codepointsCount; cnvCount++) {
int chCount=codePointLen(walker);
if(chCount<0) {
return chCount; // err
}
walker+=chCount;
}
if(walker-src < destSize && NULL!=dest) {
// enough space at destination
strncpy(src, dest, walker-src);
}
// else do nothing
return walker-src;
}
Second criterion (limited buffer size): just use the first one with the number of codepoints returned by this one
// return negative if UTF encoding error
int howManyCodepointICanFitInOutputBufferOfLen(const char* src, int maxBufflen) {
if(NULL==src) {
return -1;
}
int ret=0;
for(const char* walker=src; *walker && ret<maxBufflen; ret++) {
int advance=codePointLen(walker);
if(advance<0) {
return src-walker; // err because negative, but indicating the err pos
}
// look on all the chars between walker and walker+advance
// if any is 0, we have a premature end of the source
while(advance>0) {
if(0==*(++walker)) {
return src-walker; // err because negative, but indicating the err pos
}
advance--;
} // walker is set on the correct position for the next attempt
}
return ret;
}
static char *CutStringLength(char *lpszData, int nMaxLen)
{
if (NULL == lpszData || 0 >= nMaxLen)
{
return "";
}
int len = strlen(lpszData);
if(len <= nMaxLen)
{
return lpszData;
}
char strTemp[1024] = {0};
strcpy(strTemp, lpszData);
char *p = strTemp;
p = p + (nMaxLen-1);
if ((unsigned char)(*p) < 0xA0)
{
*(++p) = '\0'; // if the last byte is Mandarin character
}
else if ((unsigned char)(*(--p)) < 0xA0)
{
*(++p) = '\0'; // if the last but one byte is Mandarin character
}
else if ((unsigned char)(*(--p)) < 0xA0)
{
*(++p) = '\0'; // if the last but two byte is Mandarin character
}
else
{
int i = 0;
p = strTemp;
while(*p != '\0' && i+2 <= nMaxLen)
{
if((unsigned char)(*p++) >= 0xA0 && (unsigned char)(*p) >= 0xA0)
{
p++;
i++;
}
i++;
}
*p = '\0';
}
printf("str = %s\n",strTemp);
return strTemp;
}

How to remove a character from a string using baskspace in C?

can you give me an example of deleting characters from an array of characters in c?
I tried too much, but i didn't reach to what i want
That is what i did:
int i;
if(c<max && c>start) // c is the current index, start == 0 as the index of the start,
//max is the size of the array
{
i = c;
if(c == start)
break;
arr[c-1] = arr[c];
}
printf("%c",arr[c]);
A character array in C does not easily permit deleting entries. All you can do is move the data (for instance using memmove). Example:
char string[20] = "strring";
/* delete the duplicate r*/
int duppos=3;
memmove(string+duppos, string+duppos+1, strlen(string)-duppos);
You have an array of characters c:
char c[] = "abcDELETEdefg";
You want a different array that contains only "abcdefg" (plus the null terminator). You can do this:
#define PUT_INTO 3
#define TAKE_FROM 9
int put, take;
for (put = START_CUT, take = END_CUT; c[take] != '\0'; put++, take++)
{
c[put] = c[take];
}
c[put] = '\0';
There are more efficient ways to do this using memcpy or memmove, and it could be made more general, but this is the essence. If you really care about speed, you should probably make a new array that doesn't contain the characters you don't want.
Here's one approach. Instead of removing characters in place and shuffling the remaining characters (which is a pain), you copy the characters you want to keep to another array:
#include <string.h>
...
void removeSubstr(const char *src, const char *substr, char *target)
{
/**
* Use the strstr() library function to find the beginning of
* the substring in src; if the substring is not present,
* strstr returns NULL.
*/
char *start = strstr(src, substr);
if (start)
{
/**
* Copy characters from src up to the location of the substring
*/
while (src != start) *target++ = *src++;
/**
* Skip over the substring
*/
src += strlen(substr);
}
/**
* Copy the remaining characters to the target, including 0 terminator
*/
while ((*target++ = *src++))
; // empty loop body;
}
int main(void)
{
char *src = "This is NOT a test";
char *sub = "NOT ";
char result[20];
removeSubstr(src, sub, result);
printf("Src: \"%s\", Substr: \"%s\", Result: \"%s\"\n", src, sub, result);
return 0;
}
string = H E L L O \0
string_length = 5 (or just use strlen inside if you don't want to cache it outside this call
remove_char_at_index = 1 if you want to delete the 'E'
copy to the 'E' position (string + 1)
from the first 'L' position (string + 1 + 1)
4 bytes (want to get the NULL), so 5 - 1 = 4
remove_character_at_location(char * string, int string_length, int remove_char_at_index) {
/* Use memmove because the locations overlap */.
memmove(string+remove_char_at_index,
string+remove_char_at_index+1,
string_length - remove_char_at_position);
}

Resources