The fastest way to save graph to file in C

The fastest way to save graph to file in C - c

I have the following problem: I need to save a large graph with more than a million edges to a txt file. Each edge is represented by a structure containing 3 integers: from, to, cost. My task is to write a program that saves the whole graph to a txt file as quickly as possible, in this format: "from to cost\n".
I am interested in the method, how to do that.
My idea is to create a huge buffer of chars and add each digit to it without needing to reverse the digits afterwards: first I get the number of digits of each integer, then I add each digit to the buffer, then I add the whitespace/newline symbol, and I repeat this until the last number has been added.
Then I save the whole buffer to file by using fwrite() function.
Despite the fact that this method is relatively fast, I saw programs that do it faster. My question is: do you know more efficient way to implement this program in order to get faster results?
The program must be in C language.
typedef struct {
int edge_start;
int edge_count;
int parent;
int cost;
} node_t;
/* Solver state handed to the save routine. */
typedef struct {
graph_t *graph; /* not dereferenced by the save code shown here */
node_t *nodes; /* array indexed by node number; the save code reads entries 1..num_nodes-1 */
int num_nodes; /* number of entries in nodes */
int start_node; /* not read by the save code shown here */
} dijkstra_t;
The function to get the number of digits:
/* Returns one LESS than the number of decimal digits of x: 0 for every
   x below 10 (negative values included), up to 9 for x >= 1000000000.
   The save code uses the result as the offset of the last digit. */
int getNumberOfDigitsBig(int x) {
    static const int powers_of_ten[9] = {
        10, 100, 1000, 10000, 100000,
        1000000, 10000000, 100000000, 1000000000
    };
    int offset = 0;

    /* Each threshold that x reaches adds one more digit. */
    while (offset < 9 && x >= powers_of_ten[offset]) {
        ++offset;
    }
    return offset;
}
Save function:
/* Body of the save routine: formats every node as "<node> <cost>  <parent>\n"
   into one heap buffer and writes it with a single fwrite().
   Uses `dijkstra`, `filename`, `buffer`, `ret`, ZERO and getNumberOfDigits()
   from the enclosing code; `ret` ends up non-zero only if everything was
   written and the file closed cleanly. */
const dijkstra_t *const dij = (dijkstra_t*)dijkstra;
if (dij) {
    FILE *f = fopen(filename, "w");
    if (f) {
        int numberOfNodes = dij->num_nodes;
        /* Worst case per record: three values of up to 10 characters
           (INT_MAX has 10 digits), one separator after the node number,
           two before the parent and the newline = 34 bytes.  The 7-byte
           root line is emitted even when there are no nodes. */
        long bufferLength = 7 + (long)(numberOfNodes > 0 ? numberOfNodes : 0) * (10 * 3 + 4);
        buffer = (char *)malloc(bufferLength + 1);
        if (!buffer) {
            /* Out of memory: report failure instead of writing through NULL. */
            fclose(f);
            ret = 0;
        } else {
            long bufferCounter = 0;
            long counter; /* start offset of the number being written */
            int number;
            int digits;
            /* Root record is fixed: "0 0 -1\n". */
            buffer[bufferCounter++] = '0';
            buffer[bufferCounter++] = ' ';
            buffer[bufferCounter++] = '0';
            buffer[bufferCounter++] = ' ';
            buffer[bufferCounter++] = '-';
            buffer[bufferCounter++] = '1';
            buffer[bufferCounter++] = '\n';
            for(int i = 1; i < numberOfNodes; i++) {
                const node_t *const node = &(dij->nodes[i]);
                /* Node number.  `digits` is the offset of the last digit, so
                   digits are stored right-to-left and need no reversal.
                   NOTE(review): getNumberOfDigits() must return digits-1 for
                   every index below numberOfNodes -- confirm its range. */
                number = i;
                digits = getNumberOfDigits(number);
                counter = bufferCounter;
                do {
                    buffer[counter + digits] = ZERO + number % 10;
                    --digits;
                    ++bufferCounter;
                } while(number /= 10);
                buffer[bufferCounter++] = ' ';
                /* Cost; -1 is written literally. */
                number = node->cost;
                if(number != -1) {
                    digits = getNumberOfDigitsBig(number);
                    counter = bufferCounter;
                    do {
                        buffer[counter + digits] = ZERO + number % 10;
                        --digits;
                        ++bufferCounter;
                    } while(number /= 10);
                } else {
                    buffer[bufferCounter++] = '-';
                    buffer[bufferCounter++] = '1';
                }
                /* Two separators precede the parent column, as before. */
                buffer[bufferCounter++] = ' ';
                buffer[bufferCounter++] = ' ';
                /* Parent; -1 is written literally. */
                number = node->parent;
                if(number != -1) {
                    digits = getNumberOfDigitsBig(number);
                    counter = bufferCounter;
                    do {
                        buffer[counter + digits] = ZERO + number % 10;
                        --digits;
                        ++bufferCounter;
                    } while(number /= 10);
                } else {
                    buffer[bufferCounter++] = '-';
                    buffer[bufferCounter++] = '1';
                }
                buffer[bufferCounter++] = '\n';
            }
            /* fwrite() may write less than requested; treat that as failure. */
            size_t written = fwrite(buffer, 1, (size_t)bufferCounter, f);
            int closed = fclose(f) == 0;
            ret = closed && written == (size_t)bufferCounter;
            free(buffer);
        }
    }
}
Thanks for attention.

I assume that what you need is an optimized version of printf that only processes positive integers. I did not benchmark it, but I would try to do as little comparisons and operations as possible, so I ended with that function:
/* Writes the decimal digits of n to fd (intended for non-negative n only).
   Returns the value reported by fwrite(), i.e. the number of characters
   written. */
int printint(FILE *fd, int n) {
    char digits[32]; // an uint64_t uses max 20 chars in base 10
    char *const stop = digits + sizeof(digits);
    char *head = stop;

    /* Fill from the right end so no reversal is needed. */
    for (;;) {
        *--head = '0' + n % 10;
        n /= 10;
        if (n <= 0) {
            break;
        }
    }
    return fwrite(head, 1, stop - head, fd);
}
Then I would not use a huge buffer, but just rely on the default buffering of a FILE *
Saving code could then become (more or less; I just started with the example from the question):
/* Body of the save routine using printint() and the stream's own buffering.
   Emits "0 0 -1\n" for the root, then one "<node> <cost> <parent>\n" record
   per remaining node; a cost or parent of -1 is written literally.
   Uses `dijkstra`, `filename` and `ret` from the enclosing code. */
const dijkstra_t *const dij = (dijkstra_t*)dijkstra;
if (dij) {
    FILE *f = fopen(filename, "w");
    if (f) {
        int numberOfNodes = dij->num_nodes;
        fputs("0 0 -1\n", f);
        for(int i = 1; i < numberOfNodes; i++) {
            const node_t *const node = &(dij->nodes[i]);
            /* printint() takes the stream first, then the value. */
            printint(f, i);
            fputc(' ', f);
            if(node->cost != -1) {
                printint(f, node->cost);
            } else {
                fwrite("-1", 1, 2, f);
            }
            fputc(' ', f);
            if(node->parent != -1) {
                printint(f, node->parent);
            } else {
                fwrite("-1", 1, 2, f);
            }
            fputc('\n', f);
        }
        /* No private buffer is allocated in this variant, so there is
           nothing to free; fclose() flushes the stream. */
        ret = fclose(f) == 0;
    }
}

You can improve a little by using this "itoa":
/* Stores the l least-significant decimal digits of i into d[0..l-1],
   most significant first (zero-padded on the left when i has fewer
   digits).  No terminator is written; nothing happens when l <= 0. */
void gwf_i2a(char *d, int i, int l) {
    for (int pos = l - 1; pos >= 0; --pos) {
        d[pos] = '0' + (i % 10);
        i /= 10;
    }
}
ORIGINAL TIME: 76 clicks (7.6e-05 seconds).
NEW TIME: 39 clicks (3.9e-05 seconds).
source:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <vector>
#define ZERO '0'
// Writes the l least-significant decimal digits of i into d[0..l-1],
// filling from the right; nothing is written when l <= 0 and no
// terminator is appended.
void gwf_i2a(char *d, int i, int l) {
  for (char *slot = d + l; l > 0; --l) {
    *--slot = '0' + (i % 10);
    i /= 10;
  }
}
// Placeholder graph type for the benchmark; the save routines never read it.
typedef struct {
int x, y, z;
} graph_t;
// Per-node record serialised by save() and new_save().
typedef struct {
int edge_start;  // not read by the save routines
int edge_count;  // not read by the save routines
int parent;      // last column of a record; -1 is written literally as "-1"
int cost;        // middle column of a record; -1 is written literally as "-1"
} node_t;
// Bundle passed to save() and new_save().
typedef struct {
graph_t *graph;  // not dereferenced by the save routines
node_t *nodes;   // array the save routines index from 1 to num_nodes - 1
int num_nodes;   // number of entries in nodes
int start_node;  // not read by the save routines
} dijkstra_t;
// Sample input for the timing harness.
graph_t graph = {111, 222, 3456789};
node_t nodes[] = {{1, 1, 1, 9999}, {2, 2, 2, 8999}, {2, 2, 2, 1234567890}};
// num_nodes is derived from the array itself: it used to be hard-coded to 4
// for a 3-element array, which made save()/new_save() read past the end.
dijkstra_t data[] = {
    {&graph, nodes, (int)(sizeof nodes / sizeof nodes[0]), 0}};
// Returns one less than the number of decimal digits of x (0 for x < 10,
// negative values included).  The callers use it as the offset of the last
// digit of a node index, so it must cover every index: the previous version
// stopped at 3, which misplaced the digits of indices >= 10000.
int getNumberOfDigits(int x) {
  int offset = 0;
  while (x >= 10) {
    x /= 10;
    ++offset;
  }
  return offset;
}
// Returns one less than the number of decimal digits of x: 0 for any x
// below 10 (negative values included) up to 9 for x >= 1000000000.
int getNumberOfDigitsBig(int x) {
  // Largest threshold first, so the first hit decides.
  if (x >= 1000000000) return 9;
  if (x >= 100000000) return 8;
  if (x >= 10000000) return 7;
  if (x >= 1000000) return 6;
  if (x >= 100000) return 5;
  if (x >= 10000) return 4;
  if (x >= 1000) return 3;
  if (x >= 100) return 2;
  if (x >= 10) return 1;
  return 0;
}
// Baseline writer: formats "0 0 -1\n" followed by one
// "<node> <cost>  <parent>\n" record per node (index 1 onwards) into a single
// heap buffer and writes it with one fwrite().  A cost or parent of -1 is
// written literally.  Does nothing if dijkstra is null, the file cannot be
// opened, or the buffer cannot be allocated.
void save(const char *filename, const dijkstra_t *dijkstra) {
  const dijkstra_t *const dij = dijkstra;
  if (!dij) return;
  FILE *f = fopen(filename, "w");
  if (!f) return;

  const int numberOfNodes = dij->num_nodes;
  // Worst case per record: three values of up to 10 characters (INT_MAX has
  // 10 digits), one separator after the index, two before the parent and the
  // newline = 34 bytes.  The 7-byte root line is written unconditionally.
  const long recordCount = numberOfNodes > 0 ? numberOfNodes : 0;
  const long bufferLength = 7 + recordCount * (10 * 3 + 4);
  char *buffer = (char *)malloc(bufferLength);
  if (!buffer) {
    fclose(f);
    return;
  }

  long bufferCounter = 0;
  buffer[bufferCounter++] = '0';
  buffer[bufferCounter++] = ' ';
  buffer[bufferCounter++] = '0';
  buffer[bufferCounter++] = ' ';
  buffer[bufferCounter++] = '-';
  buffer[bufferCounter++] = '1';
  buffer[bufferCounter++] = '\n';
  for (int i = 1; i < numberOfNodes; i++) {
    const node_t *const node = &(dij->nodes[i]);
    // Node index.  `digits` is the offset of the last digit, so the digits
    // are stored right-to-left without a reversal pass.
    int number = i;
    int digits = getNumberOfDigits(number);
    long counter = bufferCounter;
    do {
      buffer[counter + digits] = ZERO + number % 10;
      --digits;
      ++bufferCounter;
    } while (number /= 10);
    buffer[bufferCounter++] = ' ';
    // Cost.
    number = node->cost;
    if (number != -1) {
      digits = getNumberOfDigitsBig(number);
      counter = bufferCounter;
      do {
        buffer[counter + digits] = ZERO + number % 10;
        --digits;
        ++bufferCounter;
      } while (number /= 10);
    } else {
      buffer[bufferCounter++] = '-';
      buffer[bufferCounter++] = '1';
    }
    // Two separators precede the parent column, as before.
    buffer[bufferCounter++] = ' ';
    buffer[bufferCounter++] = ' ';
    // Parent.
    number = node->parent;
    if (number != -1) {
      digits = getNumberOfDigitsBig(number);
      counter = bufferCounter;
      do {
        buffer[counter + digits] = ZERO + number % 10;
        --digits;
        ++bufferCounter;
      } while (number /= 10);
    } else {
      buffer[bufferCounter++] = '-';
      buffer[bufferCounter++] = '1';
    }
    buffer[bufferCounter++] = '\n';
  }
  fwrite(buffer, 1, (size_t)bufferCounter, f);
  fclose(f);
  free(buffer);
}
// Variant of save() that converts every number with gwf_i2a().  Produces the
// same "0 0 -1\n" + "<node> <cost>  <parent>\n" output.  Does nothing if
// dijkstra is null, the file cannot be opened, or the buffer cannot be
// allocated.
void new_save(const char *filename, const dijkstra_t *dijkstra) {
  const dijkstra_t *const dij = dijkstra;
  if (!dij) return;
  FILE *f = fopen(filename, "w");
  if (!f) return;

  const int numberOfNodes = dij->num_nodes;
  // Worst case per record: three values of up to 10 characters, four
  // separator/newline bytes; the 7-byte root line is always written.
  const long recordCount = numberOfNodes > 0 ? numberOfNodes : 0;
  const long bufferLength = 7 + recordCount * (10 * 3 + 4);
  char *buffer = (char *)malloc(bufferLength);
  if (!buffer) {
    fclose(f);
    return;
  }

  long bufferCounter = 0;
  buffer[bufferCounter++] = '0';
  buffer[bufferCounter++] = ' ';
  buffer[bufferCounter++] = '0';
  buffer[bufferCounter++] = ' ';
  buffer[bufferCounter++] = '-';
  buffer[bufferCounter++] = '1';
  buffer[bufferCounter++] = '\n';
  for (int i = 1; i < numberOfNodes; i++) {
    const node_t *const node = &(dij->nodes[i]);
    // The digit helpers return (digit count - 1), hence the + 1.
    int len = getNumberOfDigits(i) + 1;
    gwf_i2a(&buffer[bufferCounter], i, len);
    bufferCounter += len;
    buffer[bufferCounter++] = ' ';

    int number = node->cost;
    if (number != -1) {
      len = getNumberOfDigitsBig(number) + 1;
      gwf_i2a(&buffer[bufferCounter], number, len);
      bufferCounter += len;
    } else {
      buffer[bufferCounter++] = '-';
      buffer[bufferCounter++] = '1';
    }
    // Two separators precede the parent column, matching save().
    buffer[bufferCounter++] = ' ';
    buffer[bufferCounter++] = ' ';

    number = node->parent;
    if (number != -1) {
      // Same conversion as the other two columns; the digits written are
      // identical to what the hand-rolled loop produced.
      len = getNumberOfDigitsBig(number) + 1;
      gwf_i2a(&buffer[bufferCounter], number, len);
      bufferCounter += len;
    } else {
      buffer[bufferCounter++] = '-';
      buffer[bufferCounter++] = '1';
    }
    buffer[bufferCounter++] = '\n';
  }
  fwrite(buffer, 1, (size_t)bufferCounter, f);
  fclose(f);
  free(buffer);
}
// Times one save() run and prints the elapsed clock ticks and seconds.
void original() {
  const clock_t begin = clock();
  save("bogus.txt", data);
  const clock_t elapsed = clock() - begin;
  const float seconds = ((float)elapsed) / CLOCKS_PER_SEC;
  std::cout << "original: " << elapsed << " clicks (" << seconds
            << " seconds)." << std::endl;
}
// Times one new_save() run and prints the elapsed clock ticks and seconds.
void new_test() {
  const clock_t begin = clock();
  new_save("new_bogus.txt", data);
  const clock_t elapsed = clock() - begin;
  const float seconds = ((float)elapsed) / CLOCKS_PER_SEC;
  std::cout << "NEW: " << elapsed << " clicks (" << seconds
            << " seconds)." << std::endl;
}
// Runs the baseline benchmark, then the gwf_i2a-based one.
int main(int argc, char **argv) {
  (void)argc;  // command-line arguments are not used
  (void)argv;
  original();
  new_test();
  return 0;
}

[Rewritten on 2018-01-13.]
Standard I/O (printf() etc.) is indeed comparatively slow in converting numerical data to text form. Here, the problem is to output lines of form
<node> <cost> <parent>
where all three are either unsigned (32-bit) integers in decimal notation, or -1. For simplicity, let's reserve the value UINT32_MAX (4294967295) for -1.
I suggest a two-fold approach:
Construct each record from right to left. This avoids the need of checking how many digits there are in a number.
Buffer a number of records at once. This reduces the number of fwrite() calls, at the cost of a modest dynamically allocated buffer.
Note that this means that the records in each chunk must be processed last-to-first, in order to retain the correct order.
Consider the following code. Note that I've reduced the definitions of node_t and dijkstra_t to the fields that are actually used, so that the following example can be compiled as-is. Also note that instead of -1 for parent or cost, one must use UINT32_MAX, as their types are now uint32_t.
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#include <stdio.h>
/* One node of the result; only the fields the save routine prints. */
typedef struct {
uint32_t parent; /* Use UINT32_MAX for -1 */
uint32_t cost; /* Use UINT32_MAX for -1 */
} node_t;
/* The node array handed to save_edges(). */
typedef struct {
node_t *nodes; /* array of num_nodes entries; entry 0 is printed as the root "0 0 -1" */
uint32_t num_nodes; /* number of entries in nodes */
} dijkstra_t;
/* Writes 'value' in decimal so that its last character sits just
   before 'end', growing towards lower addresses.
   UINT32_MAX is the stand-in for -1 and is written as "-1".
   Returns a pointer to the first character written.
*/
static inline char *prepend_value(char *end, uint32_t value)
{
    char *cursor = end;

    if (value == UINT32_MAX) {
        cursor -= 2;
        cursor[0] = '-';
        cursor[1] = '1';
        return cursor;
    }

    /* Least significant digit first; at least one digit is always written. */
    for (;;) {
        *--cursor = '0' + (value % 10u);
        value /= 10u;
        if (value == 0u)
            break;
    }
    return cursor;
}
/* Each record consists of three unsigned 32-bit integers,
each at most 10 characters, with spaces in between
and a newline at end. Thus, at most 33 characters. */
#define RECORD_MAXLEN 33
/* We process records in chunks of 16384.
Maximum number of records (nodes) is 2**32 - 2 - RECORD_CHUNK,
or 4,294,950,910 in this case. */
#define RECORD_CHUNK 16384
/* Each chunk of record is up to CHUNK_CHARS long.
(Roughly half a megabyte in this case.) */
#define CHUNK_CHARS (RECORD_MAXLEN * RECORD_CHUNK)
/* Save the edges in a graph to a stream, one "<node> <cost> <parent>\n"
   record per node, in node order.  Node 0 is always written as "0 0 -1",
   whatever its stored cost and parent are.
   Records are formatted back-to-front into a fixed-size chunk buffer and
   written chunk by chunk.  The graph itself is only read, never modified.
   Returns 0 if success, -1 if an error occurs (NULL argument, stream
   already in error, out of memory, or a write failure).
*/
int save_edges(dijkstra_t *dij, FILE *out)
{
    if (!dij || !out || ferror(out))
        return -1;

    /* Kept unsigned: num_nodes is a uint32_t and may exceed INT_MAX. */
    const uint32_t nodes = dij->num_nodes;
    const node_t *const node = dij->nodes;
    if (nodes > 0 && !node)
        return -1;

    /* Allocate memory for the chunk buffer. */
    char *const buf = malloc(CHUNK_CHARS);
    if (!buf)
        return -1;
    char *const end = buf + CHUNK_CHARS;

    uint32_t o = 0;
    while (o < nodes) {
        /* One past the last record of this chunk; computed from the
           remaining count so that it cannot wrap around. */
        const uint32_t hi = (nodes - o > RECORD_CHUNK) ? o + RECORD_CHUNK : nodes;
        /* The root record is emitted separately below. */
        const uint32_t lo = (o == 0) ? 1 : o;
        uint32_t i = hi;
        char *ptr = end;

        /* Fill buffer back-to-front. */
        while (i-- > lo) {
            const node_t *curr = node + i;
            /* Format: <i> ' ' <cost> ' ' <parent> '\n' */
            /* We construct the record from right to left. */
            *(--ptr) = '\n';
            ptr = prepend_value(ptr, curr->parent);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, curr->cost);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, i);
        }

        if (o == 0) {
            /* Root record "0 0 -1": the first line of the output, hence
               the last one prepended to the first chunk. */
            *(--ptr) = '\n';
            ptr = prepend_value(ptr, UINT32_MAX);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, 0);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, 0);
        }

        /* Write the chunk buffer out. */
        const size_t len = (size_t)(end - ptr);
        if (fwrite(ptr, 1, len, out) != len) {
            free(buf);
            return -1;
        }
        o = hi;
    }

    free(buf);

    /* Check for write errors. */
    if (fflush(out))
        return -1;
    if (ferror(out))
        return -1;

    /* Success. */
    return 0;
}
Additional speedup is possible, if we can use POSIX low-level I/O (open(), close(), write(), and fstat() from <unistd.h>). When the destination is a pipe or device, we can just directly write the data; when the destination is a file, we should write in chunks of multiples of st_blksize, to avoid read-modify-write cycles. Unlike standard I/O, with low-level I/O we can do that with just one "overflow" buffer of st_blksize, without having to copy the entire chunk buffer around in memory. However, since the question is not tagged posix, I shall refrain from further discussion along those edges.
OP stated their own version is still faster. I found that difficult to believe, because it does much more work than my version above. When I checked, on my machine a large dataset (of say 100,000,000) cannot be written in a single fwrite() call, as it only does a partial write; a loop is required to actually write the entire dataset. Therefore, in my opinion, the benchmark OP uses to compare the different version is very suspect.
Consider the following microbenchmark instead. It generates a singly linked list, and uses an externally compiled save_graph() function to output it (to standard output). There are three versions implemented: null, which does not save anything at all; antonkretov, for OP's implementation (adapted to work here); and nominalanimal, for mine.
Makefile:
# Builds the three benchmark binaries and, with `make run`, times each of
# them writing NODES records to /dev/null.  Recipe lines start with a tab.
CC := gcc
CFLAGS := -std=c99 -O2 -Wall
LDFLAGS :=
BINS := test-null test-antonkretov test-nominalanimal
NODES := 100000000
.PHONY: all clean run
all: clean $(BINS)
clean:
	rm -f $(BINS) *.o
%.o: %.c
	$(CC) $(CFLAGS) -c $^
test-null: main.o data-null.o
	$(CC) $(CFLAGS) $^ -o $@
test-antonkretov: main.o data-antonkretov.o
	$(CC) $(CFLAGS) $^ -o $@
test-nominalanimal: main.o data-nominalanimal.o
	$(CC) $(CFLAGS) $^ -o $@
run: $(BINS)
	@echo "Testing $(NODES) nodes."
	@./test-null $(NODES) > /dev/null
	@echo "Overhead (nothing saved):"
	@bash -c 'time ./test-null $(NODES) > /dev/null'
	@echo ""
	@echo "Anton Kretov:"
	@bash -c 'time ./test-antonkretov $(NODES) > /dev/null'
	@echo ""
	@echo "Nominal Animal:"
	@bash -c 'time ./test-nominalanimal $(NODES) > /dev/null'
	@echo ""
Note that this forum converts tabs to spaces, and the Makefile format requires recipe lines to be indented with a tab, so if you copy and paste the above to a file, make sure every recipe line starts with a tab (for example, if the recipe lines are indented with spaces, run sed -e 's|^  *|\t|' -i Makefile to fix it).
data.h:
/* data.h: types and entry point shared by main.c and the three
   data-*.c implementations of save_graph(). */
#ifndef DATA_H
#define DATA_H
#include <stdint.h>
#include <limits.h>
#include <stdio.h>
/* Sentinels standing in for -1 in the unsigned fields below. */
#define INVALID_COST UINT32_MAX
#define INVALID_PARENT UINT32_MAX
typedef struct {
uint32_t parent; /* Use INVALID_PARENT for -1 */
uint32_t cost; /* Use INVALID_COST for -1 */
} node_t;
typedef struct {
node_t *nodes; /* array of num_nodes entries */
uint32_t num_nodes;
} dijkstra_t;
/* Writes the graph to the given stream.  Returns 0 on success; main.c
   treats any non-zero result as a write error. */
int save_graph(dijkstra_t *, FILE *);
#endif /* DATA_H */
data-null.c, for measuring runtime overhead:
#include "data.h"
/* Baseline for the benchmark: writes nothing and always reports success,
   so the measured time is pure setup overhead. */
int save_graph(dijkstra_t *dij, FILE *out)
{
    (void)dij; /* deliberately unused */
    (void)out; /* deliberately unused */
    return 0;
}
data-antonkretov.c, version of OP's save routine, for comparison:
#include <stdlib.h>
#include "data.h"
/* Returns one less than the number of decimal digits of x
   (0 for x < 10, up to 9 for x >= 1000000000). */
int getNumberOfDigits(uint32_t x)
{
    static const uint32_t powers_of_ten[9] = {
        10u, 100u, 1000u, 10000u, 100000u,
        1000000u, 10000000u, 100000000u, 1000000000u
    };
    int offset = 0;

    /* Count how many thresholds x reaches. */
    while (offset < 9 && x >= powers_of_ten[offset])
        offset++;
    return offset;
}
/* OP's approach adapted to data.h: format the whole graph into one buffer
   (digit count first, then digits placed right-to-left), then write it out.
   The root line "0 0 -1\n" is always emitted; nodes 1..num_nodes-1 follow as
   "<node> <cost> <parent>\n", with UINT32_MAX printed as "-1".
   Returns 0 on success, -1 on a NULL argument, size overflow, allocation
   failure or write failure. */
int save_graph(dijkstra_t *dij, FILE *out)
{
    uint32_t numberOfNodes;
    size_t bufferLength;
    size_t bufferCounter = 0, counter;
    size_t bytes;
    uint32_t number, digits, i;
    char *buffer;

    if (!dij || !out)
        return -1;
    numberOfNodes = dij->num_nodes;
    if (numberOfNodes > 1 && !dij->nodes)
        return -1;

    /* At most 33 characters per record: three values of up to 10 digits,
       two spaces and a newline. */
    bufferLength = numberOfNodes * (size_t)33;
    if ((size_t)(bufferLength / 33) != numberOfNodes)
        return -1;
    /* The 7-byte root line is written even for an empty graph. */
    if (bufferLength < 7)
        bufferLength = 7;

    buffer = malloc(bufferLength);
    if (!buffer)
        return -1;

    buffer[bufferCounter++] = '0';
    buffer[bufferCounter++] = ' ';
    buffer[bufferCounter++] = '0';
    buffer[bufferCounter++] = ' ';
    buffer[bufferCounter++] = '-';
    buffer[bufferCounter++] = '1';
    buffer[bufferCounter++] = '\n';

    for (i = 1; i < numberOfNodes; i++) {
        const node_t *const node = dij->nodes + i;

        /* Node index.  `digits` is the offset of the last digit; it wraps
           past zero after the final digit but is not used again. */
        number = i;
        digits = getNumberOfDigits(number);
        counter = bufferCounter;
        do {
            buffer[counter + digits] = '0' + (number % 10u);
            --digits;
            ++bufferCounter;
        } while (number /= 10u);
        buffer[bufferCounter++] = ' ';

        /* Cost. */
        number = node->cost;
        if (number != UINT32_MAX) {
            digits = getNumberOfDigits(number);
            counter = bufferCounter;
            do {
                buffer[counter + digits] = '0' + (number % 10u);
                --digits;
                ++bufferCounter;
            } while (number /= 10u);
        } else {
            buffer[bufferCounter++] = '-';
            buffer[bufferCounter++] = '1';
        }
        buffer[bufferCounter++] = ' ';

        /* Parent. */
        number = node->parent;
        if (number != UINT32_MAX) {
            digits = getNumberOfDigits(number);
            counter = bufferCounter;
            do {
                buffer[counter + digits] = '0' + (number % 10u);
                --digits;
                ++bufferCounter;
            } while (number /= 10u);
        } else {
            buffer[bufferCounter++] = '-';
            buffer[bufferCounter++] = '1';
        }
        buffer[bufferCounter++] = '\n';
    }

    /* A single fwrite() may be partial for very large buffers; loop until
       everything is written or no progress is made. */
    counter = 0;
    while (counter < bufferCounter) {
        bytes = fwrite(buffer + counter, 1, bufferCounter - counter, out);
        if (!bytes) {
            free(buffer);
            return -1;
        }
        counter += bytes;
    }
    free(buffer);
    return 0;
}
data-nominalanimal.c, my chunked back-to-front version of the save routine:
#include <stdlib.h>
#include "data.h"
/* Stores an unsigned 32-bit value in decimal form so that it ends
   immediately before 'end'; the text grows towards lower addresses.
   UINT32_MAX is written as "-1".
   Returns a pointer to the first character of the stored text.
*/
static inline char *prepend_value(char *end, uint32_t value)
{
    char *head = end;

    if (value != UINT32_MAX) {
        /* Emit digits from least to most significant; zero still
           produces one digit. */
        uint32_t rest = value;
        for (;;) {
            head--;
            *head = '0' + (rest % 10u);
            rest /= 10u;
            if (!rest)
                return head;
        }
    }

    head[-1] = '1';
    head[-2] = '-';
    return head - 2;
}
/* Each record consists of three unsigned 32-bit integers,
each at most 10 characters, with spaces in between
and a newline at end. Thus, at most 33 characters. */
#define RECORD_MAXLEN 33
/* We process records in chunks of 16384.
Maximum number of records (nodes) is 2**32 - 2 - RECORD_CHUNK,
or 4,294,950,910 in this case. */
#define RECORD_CHUNK 16384
/* Each chunk of record is up to CHUNK_CHARS long.
(Roughly half a megabyte in this case.) */
#define CHUNK_CHARS (RECORD_MAXLEN * RECORD_CHUNK)
/* Save the edges in a graph to a stream, one "<node> <cost> <parent>\n"
   record per node, in node order.  Node 0 is always written as "0 0 -1",
   whatever its stored cost and parent are.
   Records are formatted back-to-front into a fixed-size chunk buffer and
   written chunk by chunk.  The graph itself is only read, never modified.
   Returns 0 if success, -1 if an error occurs (NULL argument, stream
   already in error, out of memory, or a write failure).
*/
int save_graph(dijkstra_t *dij, FILE *out)
{
    if (!dij || !out || ferror(out))
        return -1;

    /* Kept unsigned: num_nodes is a uint32_t and may exceed INT_MAX. */
    const uint32_t nodes = dij->num_nodes;
    const node_t *const node = dij->nodes;
    if (nodes > 0 && !node)
        return -1;

    /* Allocate memory for the chunk buffer. */
    char *const buf = malloc(CHUNK_CHARS);
    if (!buf)
        return -1;
    char *const end = buf + CHUNK_CHARS;

    uint32_t o = 0;
    while (o < nodes) {
        /* One past the last record of this chunk; computed from the
           remaining count so that it cannot wrap around. */
        const uint32_t hi = (nodes - o > RECORD_CHUNK) ? o + RECORD_CHUNK : nodes;
        /* The root record is emitted separately below. */
        const uint32_t lo = (o == 0) ? 1 : o;
        uint32_t i = hi;
        char *ptr = end;

        /* Fill buffer back-to-front. */
        while (i-- > lo) {
            const node_t *curr = node + i;
            /* Format: <i> ' ' <cost> ' ' <parent> '\n' */
            /* We construct the record from right to left. */
            *(--ptr) = '\n';
            ptr = prepend_value(ptr, curr->parent);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, curr->cost);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, i);
        }

        if (o == 0) {
            /* Root record "0 0 -1": the first line of the output, hence
               the last one prepended to the first chunk. */
            *(--ptr) = '\n';
            ptr = prepend_value(ptr, UINT32_MAX);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, 0);
            *(--ptr) = ' ';
            ptr = prepend_value(ptr, 0);
        }

        /* Write buffer. */
        const size_t len = (size_t)(end - ptr);
        if (fwrite(ptr, 1, len, out) != len) {
            free(buf);
            return -1;
        }
        o = hi;
    }

    free(buf);

    if (fflush(out))
        return -1;
    if (ferror(out))
        return -1;
    return 0;
}
and finally the main program itself, main.c, that generates the data and calls the save_graph() functions:
#include <stdlib.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
#include "data.h"
#define EDGES_MAX 4294901759
int main(int argc, char *argv[])
{
dijkstra_t graph;
size_t bytes;
uint32_t edges, i;
char dummy;
if (argc != 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\nUsage: %s EDGES\n\n", argv[0]);
return EXIT_SUCCESS;
}
if (sscanf(argv[1], " %" SCNu32 " %c", &edges, &dummy) != 1 || edges < 1 || edges > EDGES_MAX) {
fprintf(stderr, "%s: Invalid number of edges.\n", argv[1]);
return EXIT_FAILURE;
}
bytes = (1 + (size_t)edges) * sizeof graph.nodes[0];
if ((size_t)(bytes / (1 + (size_t)edges)) != sizeof graph.nodes[0]) {
fprintf(stderr, "%s: Too many edges.\n", argv[1]);
return EXIT_FAILURE;
}
graph.num_nodes = edges + 1;
graph.nodes = malloc(bytes);
if (!graph.nodes) {
fprintf(stderr, "%s: Too many edges: out of memory.\n", argv[1]);
return EXIT_FAILURE;
}
/* Generate a graph; no randomness, to keep timing steady. */
graph.nodes[0].parent = INVALID_COST;
graph.nodes[0].cost = 0;
for (i = 1; i <= edges; i++) {
graph.nodes[i].parent = i - 1;
graph.nodes[i].cost = 1 + (i % 10);
}
/* Print graph. */
if (save_graph(&graph, stdout)) {
fprintf(stderr, "Write error!\n");
return EXIT_FAILURE;
}
/* Done. */
return EXIT_SUCCESS;
}
Running make clean run (or make NODES=100000000 clean run) recompiles the benchmarks, and measures their run time, for a graph with 100,000,000 nodes. On my machine, the output is
Testing 100000000 nodes.
Overhead (nothing saved):
real 0m0.514s
user 0m0.297s
sys 0m0.217s
Anton Kretov:
real 0m4.059s
user 0m3.379s
sys 0m0.680s
Nominal Animal:
real 0m3.336s
user 0m3.151s
sys 0m0.184s
which shows that mine is significantly faster. If we ignore the overhead (of generating the graph), mine took about 2.8 seconds of real time to save the data to /dev/null, whereas OP's took about 3.5 seconds. In other words, mine shows a 20% speed improvement.
It is important to note that both tests do produce the exact same output. For example, both ./test-nominalanimal 100000000 | sha256sum - and ./test-antonkretov 100000000 | sha256sum - show the exact same SHA256 checksums, 7504a1c97167701297c03c4aab8b0f20c5cac82a50128074d6e09c474353d0f8.
(You can also save the output to a file, and compare them; both are exactly 1,987,777,795 bytes long, and contain the exact same data. I did check.)
If you want to run a benchmark that stores the data to storage, for the comparisons to be fair, you need to start with cold caches. Otherwise the order in which you run the benchmarks will heavily impact their timings.

Related

How to unscramble a word and find all its matches in a txt file in C?

So given a string of up to 7 letters, I need to find every permutation of that string (with and without all the letters) and then check if any of those permutations can be found in my dictionary.txt file, and print the ones that match. So basically, if the user inputs "try," the permutations would be try, tr, tyr, ty, t, rty, etc., and then check if any of them match words in the txt file. I tried to do this using strncopy and strcmp, but the program doesn't always correctly deduce that two things are equal, it takes forever to run, and there's a bug where it counts having zero letters as a permutation of the original string.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define SIZE 100 /* number of words in dictionary.txt */
#define MAX 7 /* max number of letters in given string */
/* exchanges the characters stored at x and y */
void swap(char *x, char *y){
    const char first = *x;

    *x = *y;
    *y = first;
}
/* function to find permutations of the string */
void permute(char *letters, int l, int r){
if (l == r){
char *a[SIZE];
FILE *file = fopen("dictionary.txt", "r");
char target[MAX_2];
memset(target, '\0', sizeof(target));
for (int i = 0; i < SIZE; i++){
a[i] = malloc(100000);
fscanf(file, "%s", a[i]);
}
for (int i = 0; i < 10; i++){
for (int j = 0; j < r - 1; j++){
strcpy(target, a[i]);
if (strcmp(target, &letters[i]) == 0){
printf("%s\n", target);
printf("%s\n", letters);
printf("Match\n");
}
/*else if (strcmp(target, &letters[i]) != 0){
printf("%s\n", target);
printf("%s\n", letters);
printf("Not a match\n");
}
*/
}
}
for (int i = 0; i < SIZE; i++){
free (a[i]);
}
fclose(file);
}
else{
for (int i = l; i <= r; i++){
swap((tiles+l), (tiles+i));
permute(tiles, l+1, r);
swap((tiles+l), (tiles+i));
}
}
}
/* Reads up to MAX letters from standard input and searches the dictionary
   for every arrangement of them. */
int main(){
    /* initializing tile input; one extra byte for the terminator */
    char letters[MAX + 1];
    printf("Please enter your letters: ");
    /* the field width must stay equal to MAX so the buffer cannot overflow */
    if (scanf("%7s", letters) != 1){
        return 1;
    }
    /* finding size of input */
    int size = strlen(letters);
    /* finds all the permutation of the input */
    /* parameters: string; start of the string; end of the string */
    permute(letters, 0, size);
    return 0;
}
Any help or suggestions to pinpoint what I'm doing wrong would be greatly appreciated.

As hinted in my comment, you can map all permutations of a string to a single code value, just by using the bits of a big enough unsigned integer as a bit set. Thus, the (same length) permutations of e.g. the word "try" all map to the same value.
As far as I understood your problem, you also want to match words which start with a substring of the wanted word. For that to work, you need to generate N such codes, where N is the number of letters the word contains. I.e., for a three-letter word: the code for the first letter, the code for the first 2 letters, and the code for all 3 letters.
Since reading from a file is probably not the problem, here the code, showcasing the "code based" string matching idea (which should be reasonably fast):
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
/* Longest word, in letters, that the code functions accept. */
#define MAX_WORD_LENGTH 7
/* Bit set of letters: bit 0 is 'a', bit 25 is 'z'. */
typedef uint32_t WordCode;
/* The codes of all prefixes of one word, longest prefix first. */
typedef struct WordCodes_tag {
size_t count; /* number of valid entries in codes */
WordCode codes[MAX_WORD_LENGTH]; /* codes[0] covers the whole word, codes[count-1] only its first letter */
} WordCodes_t;
/* Builds the letter bit set of word[start..end) in *code.
   Returns false if the range is longer than MAX_WORD_LENGTH (which also
   covers end < start) or if it contains anything but 'a'..'z'; on the
   latter failure *code holds the bits collected so far. */
bool word_to_code(const char* word,
                  size_t start,
                  size_t end,
                  WordCode* code) {
    if ((end - start) > MAX_WORD_LENGTH) {
        return false;
    }

    *code = 0;
    for (const char *p = word + start; p != word + end; ++p) {
        const char c = *p;
        if (c < 'a' || c > 'z') {
            return false;
        }
        /* one bit per letter of the alphabet */
        const char bit = c - 'a';
        const WordCode mask = 1 << bit;
        *code |= mask;
    }
    return true;
}
/* Fills *codes with the codes of every prefix of word, longest first.
   Returns false for a NULL argument, a word longer than MAX_WORD_LENGTH,
   or a word that word_to_code() rejects. */
bool word_to_codes(const char* word, WordCodes_t* codes) {
    if (NULL == codes) {
        return false;
    }
    if (NULL == word) {
        return false;
    }

    codes->count = 0;
    const size_t nchars = strlen(word);
    if (nchars > MAX_WORD_LENGTH) {
        return false;
    }

    /* prefix lengths nchars, nchars - 1, ..., 1 */
    size_t remaining = nchars;
    while (remaining >= 1) {
        WordCode prefix_code;
        if (!word_to_code(word, 0, remaining, &prefix_code)) {
            return false;
        }
        codes->codes[codes->count] = prefix_code;
        codes->count++;
        remaining--;
    }
    return true;
}
/* Prints the codes as "(c0, c1, ...)" followed by a newline; prints
   nothing when codes is NULL. */
void show_word_codes(const WordCodes_t* codes) {
    if (NULL == codes) return;
    printf("(");
    for (size_t i = 0; i < codes->count; i++) {
        /* WordCode is a uint32_t, so it needs PRIu32 rather than %d. */
        printf("%s%" PRIu32, (i > 0) ? ", " : "", codes->codes[i]);
    }
    printf(")\n");
}
/* Returns true if the two words share the code of some prefix of equal
   length; false if either argument is NULL or has no codes. */
bool is_match(const WordCodes_t* a, const WordCodes_t* b) {
    if (!a || !b) {
        return false;
    }
    if (!a->count || !b->count) {
        return false;
    }

    const bool a_is_shorter = a->count < b->count;
    const WordCodes_t *longer = a_is_shorter ? b : a;
    const WordCodes_t *shorter = a_is_shorter ? a : b;

    /* Codes are stored longest prefix first, so skipping the extra leading
       entries of the longer word lines up prefixes of equal length. */
    const size_t skip = longer->count - shorter->count;
    for (size_t j = 0; j < shorter->count; j++) {
        if (longer->codes[skip + j] == shorter->codes[j]) {
            return true;
        }
    }
    return false;
}
/* Demo: prints the codes of the wanted word, then for every dictionary
   entry whether it matches, followed by that entry's codes. */
int main(int argc, const char* argv[]) {
    const char* wanted = "try";
    const char* dictionary[] = {
        "house", "mouse", "cat", "tree", "try", "yrt", "t"
    };
    const size_t dict_len = sizeof(dictionary) / sizeof(char*);

    WordCodes_t wanted_codes;
    if (!word_to_codes(wanted, &wanted_codes)) {
        puts("word_to_codes() failed!");
        return -1;
    }

    printf("word codes of the wanted word '%s': ", wanted);
    show_word_codes(&wanted_codes);

    for (size_t i = 0; i < dict_len; i++) {
        const char *entry = dictionary[i];
        WordCodes_t found_codes;

        if (!word_to_codes(entry, &found_codes)) {
            printf("word_to_codes(%s) failed!", entry);
            continue;
        }
        const char *verdict =
            is_match(&wanted_codes, &found_codes) ? "match" : "no match";
        printf("word codes of dictionary word '%s' (%s): ", entry, verdict);
        show_word_codes(&found_codes);
    }
    return 0;
}
As function is_match() above shows, you need only compare the codes for the respective substring length. Thus, even if you have 2 sets of up to 7 numbers, you need only maximum 7 comparisons.
The output looks like this (which seems to make sense):
./search
word codes of the wanted word 'try': (17432576, 655360, 524288)
word codes of dictionary word 'house' (no match): (1327248, 1327232, 1065088, 16512, 128)
word codes of dictionary word 'mouse' (no match): (1331216, 1331200, 1069056, 20480, 4096)
word codes of dictionary word 'cat' (no match): (524293, 5, 4)
word codes of dictionary word 'tree' (match): (655392, 655376, 655360, 524288)
word codes of dictionary word 'try' (match): (17432576, 655360, 524288)
word codes of dictionary word 'yrt' (match): (17432576, 16908288, 16777216)
word codes of dictionary word 't' (match): (524288)

If you want to match the words in a dictionary against all partial permutations of a search term, you don't have to create all permutations. (The number of permutations n! grows very quickly with the length of the search term, n.)
Instead, it is easier to write a customized search function. You can make use of two strategies here:
A word w is a permutation of the search term s if both words are equal once their letters are sorted. For example, "integral" and "triangle" are anagrams of each other, because both sort to "aegilnrt".
You can skip letters in the search term when searching to account for partial anagrams. Because the search term and the word will be sorted, you know which ones to skip: The ones that are lexically "smaller" than the next letter in the word.
So your matching function should sort the words first and then compare the words character by character in such a way that characters from the search term can be skipped.
Here's code that does that:
/* qsort() comparator for single characters: negative, zero or positive
   according to whether the first character sorts before, with or after
   the second. */
int char_cmp(const void *pa, const void *pb)
{
    const int lhs = *(const char *)pa;
    const int rhs = *(const char *)pb;

    return lhs - rhs;
}
/* Returns true if bb can be spelled with letters taken from aa, each letter
   of aa being used at most once (so bb is an anagram of aa or of part of it).
   Neither argument is modified: sorted copies are made on the heap, so the
   strings may be of any length.  Returns false if the copies cannot be
   allocated. */
bool partial_anagram(const char *aa, const char *bb)
{
    const size_t la = strlen(aa);
    const size_t lb = strlen(bb);

    /* More letters wanted than available: cannot match. */
    if (lb > la) return false;

    /* One allocation holds both copies, each with its terminator. */
    char *copy = malloc(la + lb + 2);
    if (!copy) return false;

    char *A = copy;
    char *B = copy + la + 1;
    memcpy(A, aa, la + 1);
    memcpy(B, bb, lb + 1);

    qsort(A, la, 1, char_cmp);
    qsort(B, lb, 1, char_cmp);

    const char *a = A;
    const char *b = B;
    bool found = true;

    while (*b) {
        /* Skip letters of the search term that the word does not use. */
        while (*a && *a < *b) a++;
        if (*a != *b) {
            found = false;
            break;
        }
        a++;
        b++;
    }

    free(copy);
    return found;
}
Things to note:
Sorting is done with the function qsort from <stdlib.h>, for which you need a comparator function, in this case char_cmp.
The sorted strings are copies, so that the original strings are not modified. (The code above is unsafe, because it doesn't enforce that the length of the strings is less than 64 characters. Unfortunately, the function strncpy, which can accept a maximum buffer size, is not safe, either, because it can leave the buffer unterminated. A safe way to copy the strings would be snprintf(A, sizeof(A), "%s", aa), but I've kept the strcpy for, er, "simplicity".)
The function partial_anagram takes unsorted strings and sorts them. That makes for a clean interface, but it is inefficient when you want to test against the same search term repeatedly as in your case. You could change the function, so that it expects already sorted strings. This will reduce the function to just the loop and will place the responsibility of sorting to the caller.
If you really have a lot of searches, there is yet more room for optimization. For example, you could insert the sorted dictionary into a trie. Given that your original code read the whole file for each permutation, I guess you're not worried that much about performance. :)
I've put a working example online. The code above works with pointers. If you are more at ease with indices, you can rewrite the function:
/* Index-based variant: returns true when every character of bb, counting
   repetitions, is also available in aa. Works on sorted local copies, which
   are unchecked and must fit in 64 bytes. */
bool partial_anagram(const char *aa, const char *bb)
{
    char pool[64];
    char need[64];
    unsigned have = 0;

    strcpy(pool, aa);
    strcpy(need, bb);
    qsort(pool, strlen(pool), 1, char_cmp);
    qsort(need, strlen(need), 1, char_cmp);

    for (unsigned want = 0; need[want] != '\0'; want++, have++) {
        /* Skip pool characters that sort before the wanted one. */
        while (pool[have] != '\0' && pool[have] < need[want])
            have++;
        if (pool[have] != need[want])
            return false;
    }
    return true;
}

Problem
One is using an algorithm that has exponentially growing run-time with the problem size. There are probably lots of ways to speed this up, but, as suggested by @SparKot, a trie, or prefix tree, is a particularly good fit. One can build a trie from a dictionary array of size n, assuming the lengths of the strings in your dictionary are bounded, in O(n log n). Looking up anagrams in the worst case, where the letters never run out (ignoring the arbitrary 7 limit), is still O(n).
$ bin/trie AAABBBCCCDDDEEEFFFGGGHHHIIIJJJKKKLLLMMMNNNOOOPPPQQQRRRSSSTTTUUUVVVWWWXXXYYYZZZ < Tutte_le_parole_inglesi.txt
build_index warning: duplicate "OUTSOURCING".
build_index warning: duplicate "OUTSOURCINGS".
Loaded 216553 trie entries.
AA
AAH
AAHED
AAHING
...
ZYTHUMS
ZYZZYVA
ZYZZYVAS
211929 words found.
Proposal
The reason a prefix tree is so effective, is it allows you to query prefixes as (even more) efficiently as lookup. With this, one can do a very effective branch-and-bound-style algorithm. That is, the longer the string, the less words it will be a prefix match to; if the string is not a prefix match for any of the words in the dictionary, one can rule out any longer strings and just not test them.
So my idea is, form a histogram with the Scrabble-string of length k in O(k). Then, recursively, add more and more letters, matching, until no dictionary entries are prefix matches of the string. This will run in (*I think) O(n log n + k), assuming a bound on the number of comparisons needed to distinguish words; ie, one's dictionary is not { a, aa, aaa, aaaa, aaaaa, aaaaaa, ... }.
Implementation
I use a PATRICIA tree. It is especially attractive because a lot of data is implicit; one can use a simple array to represent the leaves of a full binary tree. Specifically, the n leaves are already just the list of words in lexicographical order; we want to build an index of n - 1 branches. It requires a stop code; the null-termination in C is perfect. I don't have to create copies of everything and manage them. The code below first sets up a dynamic array, which is useful for input, then sets up a trie, then implements the algorithm.
#include <stdlib.h> /* EXIT malloc free qsort */
#include <stdio.h> /* printf */
#include <string.h> /* memmove memcpy */
#include <assert.h> /* assert */
#include <errno.h> /* errno */
#include <limits.h> /* UINT_MAX */
#include <ctype.h> /* isgraph */
/* Dynamic array.
 `MIN_ARRAY(name, type)` expands to `struct <name>_array` (a `data` pointer
 with `size` used elements and `capacity` allocated elements) and these
 static functions:
 - `<name>_array_reserve(a, min)`: ensures capacity for at least `min`
   elements. Capacity starts at 7 and grows by a factor of 1.625 per step.
   Returns 1 on success, 0 with `errno` set on failure.
 - `<name>_array_buffer(a, n)`: reserves room for `n` more elements and
   returns a pointer just past the used part; `size` is not changed.
 - `<name>_array_append(a, n)`: as `buffer`, but also adds `n` to `size`.
 - `<name>_array_new(a)`: appends one element.
 - `<name>_array()`: returns an empty array that owns no memory.
 - `<name>_array_(a)`: frees the storage and resets `a` to empty.
 Functions returning pointers return null on failure. */
#define MIN_ARRAY(name, type) \
struct name##_array { type *data; size_t size, capacity; }; \
static int name##_array_reserve(struct name##_array *const a, \
const size_t min) { \
size_t c0; \
type *data; \
const size_t max_size = (size_t)-1 / sizeof *a->data; \
if(a->data) { \
if(min <= a->capacity) return 1; \
c0 = a->capacity < 7 ? 7 : a->capacity; \
} else { \
if(!min) return 1; \
c0 = 7; \
} \
if(min > max_size) return errno = ERANGE, 0; \
/* `c_n = a1.625^n`, approximation golden ratio `\phi ~ 1.618`. */ \
while(c0 < min) { \
size_t c1 = c0 + (c0 >> 1) + (c0 >> 3); \
if(c0 >= c1) { c0 = max_size; break; } /* Unlikely. */ \
c0 = c1; \
} \
if(!(data = realloc(a->data, sizeof *a->data * c0))) \
{ if(!errno) errno = ERANGE; return 0; } \
a->data = data, a->capacity = c0; \
return 1; \
} \
static type *name##_array_buffer(struct name##_array *const a, \
const size_t n) { \
if(a->size > (size_t)-1 - n) { errno = ERANGE; return 0; } \
return name##_array_reserve(a, a->size + n) \
&& a->data ? a->data + a->size : 0; \
} \
static type *name##_array_append(struct name##_array *const a, \
const size_t n) { \
type *b; \
if(!(b = name##_array_buffer(a, n))) return 0; \
return a->size += n, b; \
} \
static type *name##_array_new(struct name##_array *const a) \
{ return name##_array_append(a, 1); } \
static struct name##_array name##_array(void) \
{ struct name##_array a; a.data = 0, a.capacity = a.size = 0; return a; } \
static void name##_array_(struct name##_array *const a) \
{ if(a) free(a->data), *a = name##_array(); }
/* Growable `char` buffer, used to hold the whole dictionary text. */
MIN_ARRAY(char, char)
/** Append a file, `fp`, to `c`, and add a '\0'. `fp` is closed before
 returning, whether or not the read succeeded.
 @return Success. A partial read is failure. @throws[fopen, fread, malloc]
 @throws[EILSEQ] The text file has embedded nulls.
 @throws[ERANGE] If the standard library does not follow POSIX. */
static int append_file(struct char_array *c, FILE *const fp) {
const size_t granularity = 4096;
size_t nread;
char *cursor;
int success = 0;
assert(c && fp);
/* Read entire file in chunks. A short read (fewer than `granularity` bytes)
 without a stream error ends the loop. */
do if(!(cursor = char_array_buffer(c, granularity))
|| (nread = fread(cursor, 1, granularity, fp), ferror(fp))
|| !char_array_append(c, nread)) goto catch;
while(nread == granularity);
/* File to `C` string. */
if(!(cursor = char_array_new(c))) goto catch;
*cursor = '\0';
/* Binary files with embedded '\0' are not allowed: the first '\0' found from
 the start of the buffer must be the terminator just written. */
if(strchr(c->data, '\0') != cursor) { errno = EILSEQ; goto catch; }
{ success = 1; goto finally; }
catch:
if(!errno) errno = EILSEQ; /* Will never be true on POSIX. */
finally:
if(fp) fclose(fp);
return success;
}
/* Trie is base-2 compact radix tree, described in <Morrison, 1968 PATRICiA>.
Specifically, this is a full binary tree. */
/* One internal node: `skip` is the number of bits passed over before the bit
 this branch tests; `left` is the number of branches in its left sub-tree. */
struct branch { unsigned skip, left; };
/* Largest values that fit in the `unsigned` fields of `struct branch`. */
static const size_t skip_max = UINT_MAX, left_max = UINT_MAX;
MIN_ARRAY(branch, struct branch)
MIN_ARRAY(leaf, char *)
/* The index (`branches`) plus the sorted pointers to the words (`leaves`). */
struct trie { struct branch_array branches; struct leaf_array leaves; };
static struct trie trie(void) { struct trie t;
t.branches = branch_array(), t.leaves = leaf_array(); return t; }
/** Releases the storage of `t`, if it is not null, and leaves it idle. */
static void trie_(struct trie *const t) {
	if(!t) return;
	branch_array_(&t->branches);
	leaf_array_(&t->leaves);
	*t = trie();
}
/** Reads one bit of the string `a`. Bits are numbered from the most
 significant bit of the first byte, so `bit` 0 is the top bit of `a[0]`.
 @return 1 if the bit is set, otherwise 0. */
static int is_bit(const char *const a, const size_t bit) {
	const size_t index = bit / 8, offset = bit % 8;
	return (a[index] & (0x80 >> offset)) ? 1 : 0;
}
/** @return Whether `a` and `b` agree on every character up to the end of the
 shorter one; that is, one of them is a prefix of the other. */
static int is_prefix(const char *a, const char *b) {
	while(*a != '\0' && *a == *b) a++, b++;
	/* Stopped at the end of `a`, or at a mismatch that is the end of `b`. */
	return *a == '\0' || *b == '\0';
}
/** Half-open index range [low, high) into the sorted leaves. */
struct range { size_t low, high; };
/** Recursively appends to `t` the branches that index the leaves in `range`,
 in pre-order: the branch for the whole range, then its left sub-tree, then
 its right sub-tree. Space for the branches must already be reserved.
 @param[bit] The first bit position to examine for this range.
 @return Success. Fails with `ERANGE` if a skip or left count does not fit in
 `struct branch`. */
static int init_branches_r(struct trie *const t, size_t bit,
const struct range range) {
struct range r;
size_t skip = 0, left;
struct branch *branch;
assert(t && t->leaves.size);
assert(t->branches.capacity >= t->leaves.size - 1);
assert(range.low <= range.high && range.high <= t->leaves.size);
if(range.low + 1 >= range.high) return 1; /* Only one, leaf. */
/* Endpoints of sorted range: skip [_1_111...] or [...000_0_] don't care.
 While the first leaf has a 1 or the last leaf has a 0 at `bit`, the bit does
 not split the range, so it is counted in `skip`. */
while(is_bit(t->leaves.data[range.low], bit)
|| !is_bit(t->leaves.data[range.high - 1], bit)) {
if(skip == skip_max) return errno = ERANGE, 0;
bit++, skip++;
}
/* Binary search for the rightmost 0 (+1). */
r = range;
while(r.low < r.high) {
size_t m = r.low + (r.high - r.low) / 2;
if(is_bit(t->leaves.data[m], bit)) r.high = m; else r.low = m + 1;
}
/* `r.low` is now the first leaf with a 1 at `bit`; the left side holds
 `r.low - range.low` leaves and therefore one fewer branches. */
if((left = r.low - range.low - 1) > left_max) return errno = ERANGE, 0;
/* Should have space for all branches pre-allocated. */
branch = branch_array_new(&t->branches), assert(branch);
branch->left = (unsigned)left;
branch->skip = (unsigned)skip;
bit++;
/* Recurse on the left leaves, then on the right leaves. */
return (r.low = range.low, r.high = range.low + left + 1,
init_branches_r(t, bit, r)) && (r.low = r.high, r.high = range.high,
init_branches_r(t, bit, r)) /* && (printf("}\n"), 1) */;
}
/** Compares the strings that `a` and `b` point to; both are pointers to
 `char *` elements. @implements qsort bsearch */
static int vstrcmp(const void *const a, const void *const b) {
	const char *const lhs = *(const char *const *)a;
	const char *const rhs = *(const char *const *)b;
	return strcmp(lhs, rhs);
}
/** Splits `a` into words, sorts them, drops duplicates (with a warning on
 `stderr`) and builds the branch index over them.
 @param[a] A zero-terminated file containing words. Will be parsed and
 modified: every character that is not `isgraph` is overwritten with '\0'.
 @param[t] An idle tree that is initialized from `a`. Any modification of `a`
 invalidates `t`.
 @return Whether the tree initialization was entirely successful. Fails with
 `EILSEQ` if `a` contains no words. */
static int build_trie(struct trie *const t, struct char_array *const a) {
	struct range range;
	size_t i, kept;
	char *cursor, *end, **leaf;
	int is_run = 0;
	/* Strict for processing ease; this could be made more permissive. */
	assert(a && a->size && a->data[a->size - 1] == '\0'
		&& t && !t->branches.size && !t->leaves.size);
	for(cursor = a->data, end = a->data + a->size; cursor < end; cursor++) {
		/* Fixme: 7-bit; mælström would be parsed as "m", "lstr", "m".
		 The cast is required: passing a negative `char` to `isgraph` is
		 undefined behaviour. */
		if(!isgraph((unsigned char)*cursor)) {
			*cursor = '\0', is_run = 0;
		} else if(!is_run) {
			/* First character of a word: remember where it starts. */
			if(!(leaf = leaf_array_new(&t->leaves))) return 0;
			*leaf = cursor, is_run = 1;
		}
	}
	if(!t->leaves.size) return errno = EILSEQ, 0; /* No parseable info. */
	/* Sort and de-duplicate. Want to treat it as an index. */
	qsort(t->leaves.data, t->leaves.size, sizeof *t->leaves.data, &vstrcmp);
	/* Single compaction pass: `kept` counts the distinct words so far, and a
	 word is kept only if it sorts strictly after the last kept one. */
	for(kept = 1, i = 1; i < t->leaves.size; i++) {
		if(strcmp(t->leaves.data[kept - 1], t->leaves.data[i]) < 0) {
			t->leaves.data[kept++] = t->leaves.data[i];
		} else {
			fprintf(stderr, "build_index warning: duplicate \"%s\".\n",
				t->leaves.data[i]);
		}
	}
	t->leaves.size = kept;
	range.low = 0, range.high = t->leaves.size;
	if(!branch_array_reserve(&t->branches, t->leaves.size - 1)
		|| !init_branches_r(t, 0, range)) return 0;
	assert(t->branches.size + 1 == t->leaves.size);
	return 1;
}
/** @return In `t`, which must be non-empty, given a `prefix`, stores all leaf
prefix matches, only given the index, ignoring don't care bits.
 Because skipped bits are never compared, the caller has to confirm the
 result against an actual leaf.
@order \O(`prefix.length`) */
static struct range partial_prefix(const struct trie *const t,
const char *const prefix) {
/* `[n0, n1)` is the range of branches still under consideration; `i` is the
 index of the first leaf beneath it. */
size_t n0 = 0, n1 = t->branches.size, i = 0, left;
struct branch *branch;
size_t byte, key_byte = 0, bit = 0;
struct range range = { 0, 0 };
assert(t && prefix);
assert(n1 + 1 == t->leaves.size); /* Full binary tree. */
while(n0 < n1) {
branch = t->branches.data + n0;
bit += branch->skip;
/* '\0' is not included for partial match: stop descending once the bit to
 test lies at or beyond the terminator of `prefix`. */
for(byte = bit >> 3; key_byte <= byte; key_byte++)
if(prefix[key_byte] == '\0') goto finally;
left = branch->left;
/* A 0 bit descends left, a 1 bit descends right past the left sub-tree. */
if(!is_bit(prefix, bit++)) n1 = ++n0 + left;
else n0 += left + 1, i += left + 1;
}
assert(n0 == n1);
finally:
assert(n0 <= n1 && i - n0 + n1 < t->leaves.size);
/* The `n1 - n0` remaining branches cover `n1 - n0 + 1` leaves from `i`. */
range.low = i, range.high = i - n0 + n1 + 1;
return range;
}
/* #return Given a `prefix`, what is the range of matched strings in `t`. */
static struct range prefix(const struct trie *const t,
const char *const prefix) {
struct range range;
assert(t && prefix);
if(!t->leaves.size) goto catch;
range = partial_prefix(t, prefix);
if(range.low <= range.high)
if(!is_prefix(prefix, t->leaves.data[range.low])) goto catch;
goto finally;
catch:
range.low = range.high = 0;
finally:
return range;
}
/* Debug graph. */
/** Given a branch `b` in `tr` branches, calculate the right child branches.
 Only the left count is stored, so the tree is descended from the root while
 tracking how many branches (`total`) the current sub-tree contains.
@order \O(log `size`) */
static unsigned right_count(const struct trie *const tr,
const unsigned b) {
unsigned left, right, total = (unsigned)tr->branches.size, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
/* A sub-tree of `total` branches is this one plus `left` plus `right`. */
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < total && right < total);
if(b0 >= b) break;
/* Branches are stored in pre-order: the left sub-tree follows directly. */
if(b <= b0 + left) total = left, b0++;
else total = right, b0 += left + 1;
}
assert(b0 == b);
return right;
}
/** @return Follows the branches to `b` in `tr` and returns the leaf: the
 index of the first (leftmost) leaf beneath branch `b`. */
static unsigned left_leaf(const struct trie *const tr,
const unsigned b) {
unsigned left, right, total = (unsigned)tr->branches.size, i = 0, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < tr->branches.size && right < tr->branches.size);
if(b0 >= b) break;
if(b <= b0 + left) total = left, b0++;
/* Going right passes over the `left + 1` leaves of the left sub-tree. */
else total = right, b0 += left + 1, i += left + 1;
}
assert(b0 == b);
return i;
}
/** Writes `tr` as a GraphViz `digraph` to the file named `fn`, for debugging.
 Failure to open or to close the file is reported with `perror` and is
 otherwise ignored. A trie with a single leaf has no branches and is drawn as
 that one leaf. */
static void graph(const struct trie *const tr, const char *const fn) {
	unsigned left, right, b, i;
	FILE *fp = 0;
	assert(tr && fn);
	if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
	fprintf(fp, "digraph {\n"
		"\tgraph [truecolor=true, bgcolor=transparent];\n"
		"\tfontface=modern;\n"
		"\tnode [shape=none];\n"
		"\n");
	/* Idle means no leaves. Testing the branch count instead would wrongly
	 treat a one-word trie (one leaf, zero branches) as idle. */
	if(!tr->leaves.size) {
		assert(!tr->branches.size);
		fprintf(fp, "\tidle;\n");
	} else {
		assert(tr->branches.size + 1 == tr->leaves.size);
		fprintf(fp, "\t// branches\n");
		for(b = 0; b < tr->branches.size; b++) { /* Branches. */
			const struct branch *branch = tr->branches.data + b;
			left = branch->left, right = right_count(tr, b);
			fprintf(fp, "\ttree%pbranch%u [label = \"%u\", shape = circle, "
				"style = filled, fillcolor = Grey95];\n"
				"\ttree%pbranch%u -> ", (const void *)tr, b, branch->skip,
				(const void *)tr, b);
			/* The left child is the next branch, or a leaf if there is none. */
			if(left) fprintf(fp, "tree%pbranch%u [arrowhead = rnormal];\n",
				(const void *)tr, b + 1);
			else fprintf(fp,
				"tree%pleaf%u [color = Gray85, arrowhead = rnormal];\n",
				(const void *)tr, left_leaf(tr, b));
			fprintf(fp, "\ttree%pbranch%u -> ", (const void *)tr, b);
			/* The right child comes after the whole left sub-tree. */
			if(right) fprintf(fp, "tree%pbranch%u [arrowhead = lnormal];\n",
				(const void *)tr, b + left + 1);
			else fprintf(fp,
				"tree%pleaf%u [color = Gray85, arrowhead = lnormal];\n",
				(const void *)tr, left_leaf(tr, b) + left + 1);
		}
	}
	fprintf(fp, "\t// leaves\n");
	for(i = 0; i < tr->leaves.size; i++) fprintf(fp,
		"\ttree%pleaf%u [label = <%s<FONT COLOR=\"Gray85\">⊔</FONT>>];\n",
		(const void *)tr, i, tr->leaves.data[i]);
	fprintf(fp, "\n"
		"\tnode [color = \"Red\"];\n"
		"}\n");
	/* Closing flushes buffered output, so a write error can surface here. */
	if(fclose(fp) == EOF) perror(fn);
}
/* Actual program. */
/* The input argument histogram. Used in <fn:find_r>. (Simple, but questionable
design choice.) Indexed by character code; holds how many of each character
 are still available. */
static unsigned char hist[128];
/* Largest count one histogram entry can hold, and the number of entries. */
static const size_t hist_max = UCHAR_MAX,
hist_size = sizeof hist / sizeof *hist;
/* Running total of the words printed by <fn:find_r>. */
static size_t words_found;
/** Branch-and-bound recursive function. */
static void find_r(const struct trie *const tr, char *const word) {
struct range r;
size_t len, i;
assert(word);
r = prefix(tr, word);
if(r.low >= r.high) return; /* Found nothing, we can bound this branch. */
if(!strcmp(word, tr->leaves.data[r.low])) { /* Found a match. */
printf("%s\n", word), words_found++;
if(++r.low == r.high) return;
}
len = strlen(word);
for(i = 0; i < hist_size; i++) {
unsigned char *freq;
if(!*(freq = hist + i)) continue;
(*freq)--;
word[len] = (char)i, word[len + 1] = '\0';
find_r(tr, word);
(*freq)++;
}
}
/* Reads a dictionary from stdin, indexes it, and prints every dictionary word
 that can be spelled with the characters of the single command-line argument.
 Also writes the index as a graph to "dictionary.gv". */
int main(int argc, char *argv[]) {
struct char_array dict = char_array();
struct trie tr = trie();
char *word;
size_t i;
int success = EXIT_FAILURE;
assert(CHAR_BIT == 8); /* C89 this value can change, assumes C99 value. */
if(argc != 2) { errno = EILSEQ;
fprintf(stderr, "Needs argument and dictionary input.\n"); goto catch; }
word = argv[1];
/* Load the dictionary from stdin and index it into a trie. */
if(!append_file(&dict, stdin) || !build_trie(&tr, &dict)) goto catch;
fprintf(stderr, "Loaded %lu trie entries.\n",(unsigned long)tr.leaves.size);
graph(&tr, "dictionary.gv");
/* Histogram the argument. */
for(i = 0; word[i] != '\0'; i++) {
unsigned char *freq;
if(word[i] & 0x80) continue; /* UTF-8 is not supported. :[ */
if(*(freq = hist + word[i]) == hist_max)
{ errno = ERANGE; goto catch; } /* "aaaaaaaaa..." x 5M? */
(*freq)++;
}
/* Might as well re-use the word now that we're finished with it; it's the
right length. */
*word = '\0', find_r(&tr, word);
fprintf(stderr, "%lu words found.\n", (unsigned long)words_found);
{ success = EXIT_SUCCESS; goto finally; }
catch:
perror("word");
finally:
/* Reached on success and on failure: release the index and the text. */
trie_(&tr);
char_array_(&dict);
return success;
}

Is binary to decimal conversion rounded? how?

I have to convert a binary number to decimal as part of re-coding printf (no library functions allowed except malloc and write). I'm doing my calculations on a char *, so it can't overflow. But beyond a certain size, my result differs from that of an online binary converter, and I noticed that the online converter always keeps only 20 digits.
for exemple :
binary : 1.11111111111111111111111111
binary converter = 1.99999998509883880615,
my own converter == 1.99799896499882880605234375,
I guess the online converter keep the result rounded in an unsigned long long, but i don't understand how this rounding is calculated.
Do you have any clues?
Here is my code:
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
/*
** Writes the decimal expansion of 2^-i into tmp, as the digit of the integer
** part followed by i fractional digits and no decimal point: i = 3 gives
** "0125" (0.125), i = 0 gives "1".
** tmp must be zero-filled and hold at least i + 2 bytes: the first zero byte
** marks the end of the digits written so far.
** Returns tmp.
*/
char *ft_binary_pow(char *tmp, int i)
{
    tmp[0] = '1';
    while (i > 0)
    {
        int j = 0;
        int remnant = 0;

        /* Halve the number digit by digit, most significant first. Each digit
        ** is worked on times ten, so the half left over moves to the next
        ** position as a remnant of 5. */
        while (isdigit((unsigned char)tmp[j]) || remnant != 0)
        {
            int div = tmp[j] ? ((tmp[j] - '0') * 10) / 2 : 0;

            tmp[j] = ((div / 10) + remnant) + '0';
            remnant = div % 10;
            j++;
        }
        i--;
    }
    return (tmp);
}
/*
** Adds the digit string tmp into ret, position by position from the last
** digit of tmp backwards, and returns ret. A zero byte in ret counts as the
** digit 0.
** NOTE(review): when add % 10 + remnant reaches 10 the digit is stored as '0'
** but the next remnant is derived from add / 10 only, so that carry is lost.
*/
char *ft_add_tmp(char *ret, char *tmp)
{
int i;
int j; /* unused */
int add;
int remnant;
i = strlen(tmp);
add = 0;
remnant = 0;
while (--i >= 0)
{
if (!ret[i])
add = tmp[i] - '0';
else
add = (ret[i] - '0') + (tmp[i] - '0');
if ((add % 10 + remnant) < 10)
ret[i] = (add % 10 + remnant) + '0';
else
ret[i] = '0';
remnant = add / 10 ? 1 : 0;
}
return (ret);
}
/*
** Converts the binary fraction in lol (the bits after "1.") to decimal:
** starting from "1", adds 2^-(k+1) for every bit k that is set, then prints
** the digits.
*/
int main(void)
{
    char *lol = "11111111111111111111111111";
    char *ret = (char *)calloc(340, sizeof(char));

    if (!ret)
        return (0);
    ret[0] = '1';
    for (int i = 0; lol[i]; i++)
    {
        char *tmp;

        if (lol[i] != '1')
            continue;
        tmp = (char *)calloc(50, sizeof(char));
        if (!tmp)
            return (0);
        tmp = ft_binary_pow(tmp, i + 1);
        ret = ft_add_tmp(ret, tmp);
        free(tmp);
    }
    printf("ret = %s\n", ret);
    return (0);
}
Edited for more readable code
Thanks for your time!

Code fails even with lol = "111111111".
Code fails in ft_add_tmp() to handle a carry (overflow) into the next most significant digit.
else {
ret.decimal[i] = '0';
// add something like this and then also prorogate carry as needed to higher digits
ret.decimal[i-1]++;
}
Suggested correction and simplification:
/* Adds the decimal digit string tmp into ret, aligned at index 0, working
   from the last digit of tmp backwards with a carry. A zero byte in ret
   counts as the digit 0. Asserts that no carry is left over past index 0.
   Returns ret. */
char* ft_add_tmp(char *ret, const char *tmp) {
  unsigned carry = 0;
  for (size_t i = strlen(tmp); i > 0; ) {
    --i;
    unsigned digit_sum = carry + (unsigned)(tmp[i] - '0');
    if (ret[i]) {
      digit_sum += (unsigned)(ret[i] - '0');
    }
    ret[i] = digit_sum % 10 + '0';
    carry = digit_sum / 10;
  }
  assert(carry == 0);
  return (ret);
}

How can I multiply two strings containing 'huge numbers' (over 30 digits)?

I'm doing a school project in which I first need to read 2 huge numbers (unlimited size; for the sake of example, let's say over 30 digits). The second step is to take the 2 input numbers and create a new number that is the product of the two, which I'm really breaking a sweat trying to do.
My code so far:
Type definition to making sure I'm handling the right variables:
typedef char* verylong;
#define MAX_SIZE 100
Input method:
/* Reads one line from stdin and returns it as a newly allocated string of
   decimal digits; the caller owns the result and must free() it.
   Returns NULL, after printing "Bad input!", if nothing could be read or the
   line contains a non-digit. Returns NULL silently if allocation fails.
   Input is limited to MAX_SIZE - 1 characters per line. */
verylong input_long() {
    char temp_str[MAX_SIZE]; /* the input from the user */
    size_t i, len;
    verylong number;

    /* fgets bounds the read to the buffer; gets cannot, and it was removed
       from the language in C11. */
    if (fgets(temp_str, sizeof temp_str, stdin) == NULL) {
        printf("\nBad input!\n");
        return NULL;
    }
    /* fgets keeps the newline; cut the line there. */
    len = strcspn(temp_str, "\n");
    temp_str[len] = '\0';
    /* Validate before allocating so the error path has nothing to free. */
    for (i = 0; i < len; i++) {
        if (temp_str[i] < '0' || temp_str[i] > '9') {
            printf("\nBad input!\n");
            return NULL;
        }
    }
    number = calloc(len + 1, sizeof(char)); /* + 1 for the '\0' */
    if (number == NULL)
        return NULL;
    memcpy(number, temp_str, len); /* calloc already supplied the '\0' */
    return number;
}
My sad attempt of completing my task:
/* Attempt at multiplying two digit strings; returns a newly allocated string.
   As written it only multiplies digits at the same index of vl1 and vl2 and
   writes each product's digits one after another, so the result is not the
   product of the two numbers. */
verylong multiply_verylong(verylong vl1, verylong vl2) {
verylong mult;
int cur, i, j, k, lrg, sml, temp_size;
char *temp;
/* NOTE(review): writing starts at temp[1], so temp[0] keeps the zero byte
   from calloc and the copy made below begins with '\0'. */
j = 1;
/* NOTE(review): lrg and sml are read here before they are assigned below. */
temp = (char*)calloc(lrg + sml + 1, sizeof(char)); //maximum amount of digits
if (strlen(vl1) > strlen(vl2)) {
lrg = strlen(vl1);
sml = strlen(vl2);
}
else {
lrg = strlen(vl2);
sml = strlen(vl1);
}
cur = 0;
for (i = sml-1; i >= 0; i--) {
k = 0;
temp_size = 0;
/* Product of the digits at the same index i of both strings. */
cur = (vl1[i] - '0')*(vl2[i] - '0');
printf("\ncur=%d", cur);
if (cur > 9)
temp_size = 2;
else
temp_size = 1;
/* Append the digits of cur, least significant first. */
while (k < temp_size) {
if (cur > 9)
temp[j++] = (cur % 10) + '0';
else
temp[j++] = cur + '0';
cur /= 10;
k++;
}
}
/* Copy the j bytes written so far into a right-sized result. */
mult = (char*)calloc(j + 1, sizeof(char));
for (i = 0; i < j; i++) {
mult[i] = temp[i];
}
mult[i] = '\0';
free(temp);
return mult;
}
Long story short, I know I'm doing mistake at my multiplication method since I'm adding the numbers by simply adding the mult of 2 digits at a time, over that I truly am lost.
Thanks.

My advice would be to break the task into a number of simpler task.
How would you do the multiplication on paper?
123 * 456 -> 1 * (456 * 100) + 2 * (456 * 10) + 3 * (456 * 1)
or written differently
3 * ( 1 * 456)
+ 2 * ( 10 * 456)
+ 1 * (100 * 456)
---------------
SUM TO GET RESULT
or
3 * 456
+ 2 * 4560
+ 1 * 45600
---------------
SUM TO GET RESULT
From this you can identify 3 tasks
Multiplying with powers of 10, i.e. 1, 10, 100, etc. (i.e. add zeros to the end)
Multiplying a string-number with a single digit
Adding two string-numbers.
Write simple functions for each of these steps.
char* mulPowerOf10(char* sn, unsigned power)
{
...
}
char* mulDigit(char* sn, char digit)
{
...
}
char* addNumbers(char* snA, char* snB)
{
...
}
Using these 3 simple functions you can put the real multiplication together. In pseudo-code:
char* mulNumbers(char* snA, char* snB)
{
char* result = malloc(2);
strcpy(result, "0");
unsigned power = 0;
for_each_digit D in snA
{
char* t1 = mulPowerOf10(snB, power)
char* t2 = mulDigit(t1, D)
result = addNumbers(result, t2)
++power;
}
free(.. what needs to be freed ..);
return result;
}

Here is a code example.
I found it simpler to store the number as a sequence of digits along with the length in a struct. The number may have leading zeros.
#define MAX_SIZE 1024
// Number is a sequence of len decimal digits, most significant first.
// Each element of digits holds a value 0..9, not a character code.
typedef struct Number {
    int len;
    char digits[];
} Number;
// Instantiate a number with room for len digits, all set to 0.
// Returns NULL if len is negative or the allocation fails.
// The caller owns the result and releases it with free().
Number *newNumber(int len) {
    if (len < 0)
        return NULL;
    // calloc zero-fills, so every digit starts at 0.
    Number *n = calloc(1, sizeof *n + (size_t)len);
    if (n == NULL)
        return NULL;
    n->len = len;
    return n;
}
// inputNumber reads one line from stdin and converts it to a Number.
// It returns NULL if nothing could be read, if the line is empty, or if it
// contains anything other than the digits 0-9.
Number *inputNumber() {
    char line[MAX_SIZE];
    // fgets rather than gets, which is deprecated since C11
    if (!fgets(line, sizeof line, stdin))
        return NULL;
    // drop the trailing newline, if there is one
    int len = strlen(line);
    if (len > 0 && line[len-1] == '\n')
        line[--len] = '\0';
    if (len == 0)
        return NULL;
    // reject anything that is not a decimal digit
    for (const char *p = line; *p != '\0'; p++) {
        if (*p < '0' || *p > '9')
            return NULL;
    }
    // store digit values, not character codes
    Number *n = newNumber(len);
    for (int i = 0; i < len; i++)
        n->digits[i] = line[i] - '0';
    return n;
}
To multiply two numbers n1 and n2, we multiply n1 with each digit of n2, and accumulate the result shifted on the left by the position of the n2 digit in the final result.
For instance, to multiply 123*456, we compute 123*4 + 123*5*10 + 123*6*100. Note that *10 and *100 are simply left shifts.
We thus need a function that multiplies a number with a digit, and another function that accumulates a number with a left shift in a result number.
// multiplyNumber stores the product of n and digit in result.
// digit is a value 0..9, not a character code.
// Requires that the len of result is the len of n + 1.
void multiplyNumber(Number *n, char digit, Number *result) {
    int carry = 0;
    // Walk both numbers from the least significant digit; result has one
    // extra leading position, which receives the final carry.
    for (int i = result->len-1, j = n->len-1; i > 0; i--, j--) {
        int x = n->digits[j] * digit + carry;
        result->digits[i] = x%10;
        carry = x/10;
    }
    result->digits[0] = carry;
}
// accumulateNumber adds n, shifted left by s digits, to the number r.
// r should have at least len of n + s digits, plus one more if the sum can
// carry out of them. Anything that would land left of r's first digit is
// dropped rather than written outside the array.
void accumulateNumber(Number *n, int s, Number *r) {
    int carry = 0;
    int i = r->len-1-s;
    // Add digit by digit, starting s positions from the right end of r.
    for (int j = n->len-1; j >= 0 && i >= 0; i--, j--) {
        int x = r->digits[i] + n->digits[j] + carry;
        r->digits[i] = x%10;
        carry = x/10;
    }
    // Add the remaining carry into the higher digits of r instead of
    // overwriting whatever is stored there.
    for (; carry != 0 && i >= 0; i--) {
        int x = r->digits[i] + carry;
        r->digits[i] = x%10;
        carry = x/10;
    }
}
Finally, we also need a function to print the number
// printNumber writes n to stdout in decimal, without leading zeros, followed
// by a newline. A number made only of zeros (or with no digits) prints "0".
void printNumber(Number *n) {
    int first = 0;
    // find the first non-zero digit
    while (first < n->len && n->digits[first] == 0)
        first++;
    if (first == n->len) {
        printf("0\n");
        return;
    }
    for (int i = first; i < n->len; i++)
        putchar(n->digits[i] + '0');
    putchar('\n');
}
And this is it. We can now write the main function with the input of the numbers, the multiplication of number 1 with each digit of number 2 and accumulate the result with a shift to get the final result.
// Reads two numbers from stdin and prints their product.
// Returns 0 on success and 1 on invalid input or allocation failure.
int main() {
    int status = 1;
    Number *n1 = NULL, *n2 = NULL, *r = NULL, *tmp = NULL;

    printf("number 1: ");
    n1 = inputNumber();
    if (n1 == NULL) {
        printf("number 1 is invalid\n");
        goto done;
    }
    printf("number 2: ");
    n2 = inputNumber();
    if (n2 == NULL) {
        printf("number 2 is invalid\n");
        goto done;
    }
    // accumulateNumber needs the len of r to be at least len of tmp + shift
    // + 1; with tmp of len n1 + 1 and a largest shift of n2 - 1, that is
    // n1 + n2 + 1. printNumber skips the extra leading zero.
    r = newNumber(n1->len+n2->len+1);
    tmp = newNumber(n1->len+1);
    if (r == NULL || tmp == NULL) {
        fprintf(stderr, "out of memory\n");
        goto done;
    }
    // Multiply n1 by each digit of n2, least significant first, and add the
    // partial product shifted by that digit's position.
    for (int i = 0; i < n2->len; i++) {
        multiplyNumber(n1, n2->digits[n2->len-1-i], tmp);
        accumulateNumber(tmp, i, r);
    }
    printf("result: ");
    printNumber(r);
    status = 0;
done:
    free(tmp);
    free(r);
    free(n2);
    free(n1);
    return status;
}

Here you may have a look at a 'string only' version, multiplying like you would do with a pencil.
It works with 2 loops. The outer loop takes the digits of value2 from the right and multiplies in the inner loop with every digit of value1 from right. The right digit of the multiplication is stored in result, the rest goes in carry for the next inner loop.
At the end of the inner loop, carry is added to result.
After the first outer loop, we have to add previous results to our multiplication.
This is done in if(!first && *lresp) r += toI(*lresp)
The final loop moves the result to the start of the char array.
#include <stdio.h>
#include <stdlib.h>
/* toI: digit character to its value. toC: digit value to its character. */
#define toI(x) ((x)-'0')
#define toC(x) ((x)+'0')
/* The whole expansion is parenthesized so it stays one operand inside a
   larger expression. Each argument may be evaluated twice. */
#define max(a,b) (((a)>(b)) ? (a):(b))
/* Multiplies the decimal digit strings buf1 and buf2 the pencil-and-paper way
   and returns the product as a newly allocated string, which the caller must
   free. Digits are produced from the right end of the result buffer and moved
   to its start at the end.
   NOTE(review): the malloc result is not checked, and empty input strings are
   not handled (endp1-- / endp2-- would step before the start of the string). */
char *mul(char *buf1, char *buf2) {
int size, v1, v2, r, carry=0, first=1;
char *startp1, *endp1, *lendp1, *startp2, *endp2;
char *startres, *endres, *resp, *lresp, *result;
for(endp1 = startp1 = buf1; *endp1; endp1++); // start and endpointer 1st value
for(endp2 = startp2 = buf2; *endp2; endp2++); // start and end pointer 2nd value
size = endp2-startp2 + endp1-startp1; // result size
startres = endres = resp = result = malloc(size+10); // some reserve
endres += size+10-1; // result end pointer
for(resp = startres; resp <= endres; resp++) *resp = '\0'; // init result
// resp ends one past endres; resp-=2 puts the first digit just before the
// last byte, which stays '\0'. Each outer pass moves one position left.
for(endp1--, endp2--, resp-=2; endp2>=startp2; endp2--, resp--, first=0) {
v2 = toI(*endp2); // current digit of value2
for(lresp = resp, lendp1 = endp1; lendp1 >= startp1; lendp1--, lresp--) {
v1 = toI(*lendp1); // current digit of value1
r = v1 * v2 + carry; // multiply + carry
if(!first && *lresp) r += toI(*lresp); // add result of previous loops
*lresp = toC(r%10); // store last digit
carry = r/10;
}
// write out whatever carry is left; this also resets carry to 0
for( ; carry != 0; carry /= 10)
*lresp-- = toC(carry%10);
}
// we began right with reserve, now move to start of result
for(lresp++; lresp < endres; lresp++, startres++)
*startres=*lresp;
*startres = '\0';
return result;
}
/* Demo: multiplies two fixed numbers and prints the product. */
int main() {
    char *product;

    product = mul("123456789", "12345678");
    printf("\n%s\n", product);
    free(product);
}

How to format number adding points between each 3 numbers [duplicate]

In C, how can I format a large number from e.g. 1123456789 to 1,123,456,789?
I tried using printf("%'10d\n", 1123456789), but that doesn't work.
Could you advise anything? The simpler the solution the better.

If your printf supports the ' flag (as required by POSIX 2008 printf()), you can probably do it just by setting your locale appropriately. Example:
#include <stdio.h>
#include <locale.h>
/* Prints a number with the thousands grouping of the user's locale. */
int main(void)
{
    const int value = 1123456789;

    /* Take the numeric formatting rules from the environment. */
    setlocale(LC_NUMERIC, "");
    /* The ' flag asks printf to insert the locale's grouping separator. */
    printf("%'d\n", value);
    return 0;
}
And build & run:
$ ./example
1,123,456,789
Tested on Mac OS X & Linux (Ubuntu 10.10).

You can do it recursively as follows (beware INT_MIN if you're using two's complement, you'll need extra code to manage that):
/* Prints n to stdout with a comma between each group of three digits.
   Expects a non-negative n. */
void printfcomma2 (int n) {
    if (n >= 1000) {
        /* Print the higher groups first, then this group zero-padded. */
        printfcomma2 (n / 1000);
        printf (",%03d", n % 1000);
    } else {
        /* Leading group: no padding and no comma. */
        printf ("%d", n);
    }
}
/* Prints n to stdout with comma-separated thousands. A negative n is printed
   as "-" followed by its magnitude; negating INT_MIN overflows, so that
   value is not handled. */
void printfcomma (int n) {
    int magnitude = n;

    if (magnitude < 0) {
        printf ("-");
        magnitude = -magnitude;
    }
    printfcomma2 (magnitude);
}
A summary:
User calls printfcomma with an integer, the special case of negative numbers is handled by simply printing "-" and making the number positive (this is the bit that won't work with INT_MIN).
When you enter printfcomma2, a number less than 1,000 will just print and return.
Otherwise the recursion will be called on the next level up (so 1,234,567 will be called with 1,234, then 1) until a number less than 1,000 is found.
Then that number will be printed and we'll walk back up the recursion tree, printing a comma and the next number as we go.
There is also the more succinct version though it does unnecessary processing in checking for negative numbers at every level (not that this will matter given the limited number of recursion levels). This one is a complete program for testing:
#include <stdio.h>
/* Prints n to stdout with comma-separated thousands, handling the sign and
   the grouping in one recursive function. Negating INT_MIN overflows, so
   that value is not handled. */
void printfcomma (int n) {
    if (n < 0) {
        printf ("-");
        printfcomma (-n);
    } else if (n < 1000) {
        /* Leading group: no padding and no comma. */
        printf ("%d", n);
    } else {
        printfcomma (n / 1000);
        printf (",%03d", n % 1000);
    }
}
/* Test driver: prints each sample value next to its comma-grouped form. */
int main (void) {
    const int x[] = {-1234567890, -123456, -12345, -1000, -999, -1,
                     0, 1, 999, 1000, 12345, 123456, 1234567890};
    const size_t count = sizeof(x) / sizeof(*x);

    for (size_t i = 0; i < count; i++) {
        printf ("%-15d: ", x[i]);
        printfcomma (x[i]);
        printf ("\n");
    }
    return 0;
}
and the output is:
-1234567890 : -1,234,567,890
-123456 : -123,456
-12345 : -12,345
-1000 : -1,000
-999 : -999
-1 : -1
0 : 0
1 : 1
999 : 999
1000 : 1,000
12345 : 12,345
123456 : 123,456
1234567890 : 1,234,567,890
An iterative solution for those who don't trust recursion (although the only problem with recursion tends to be stack space which will not be an issue here since it'll only be a few levels deep even for a 64-bit integer):
/* Iterative version: prints n to stdout with comma-separated thousands.
   The three-digit groups below the leading one are peeled off into
   low_groups, then printed back from the most significant down.
   Negating INT_MIN overflows, so that value is not handled. */
void printfcomma (int n) {
    int low_groups = 0;
    int unit = 1;

    if (n < 0) {
        printf ("-");
        n = -n;
    }
    /* Move every group except the leading one from n into low_groups. */
    for (; n >= 1000; n /= 1000, unit *= 1000)
        low_groups += unit * (n % 1000);
    printf ("%d", n);
    /* unit is a power of 1000; step down one group per pass. */
    while (unit > 1) {
        unit /= 1000;
        n = low_groups / unit;
        low_groups %= unit;
        printf (",%03d", n);
    }
}
Both of these generate 2,147,483,647 for INT_MAX.
All the code above is for comma-separating three-digit groups but you can use other characters as well, such as a space:
/* Prints n to stdout with a space between each group of three digits.
   Expects a non-negative n. */
void printfspace2 (int n) {
    if (n >= 1000) {
        printfspace2 (n / 1000);
        printf (" %03d", n % 1000);
    } else {
        printf ("%d", n);
    }
}
/* Prints n to stdout with space-separated thousands. A negative n is printed
   as "-" followed by its magnitude; negating INT_MIN overflows, so that
   value is not handled. */
void printfspace (int n) {
    int magnitude = n;

    if (magnitude < 0) {
        printf ("-");
        magnitude = -magnitude;
    }
    printfspace2 (magnitude);
}

Here's a very simple implementation. This function contains no error checking, buffer sizes must be verified by the caller. It also does not work for negative numbers. Such improvements are left as an exercise for the reader.
/* Writes n into out as decimal text with a comma between each group of three
   digits. No error checking: out must be large enough, and negative numbers
   are not formatted correctly. */
void format_commas(int n, char *out)
{
    char digits[20];
    size_t len;
    size_t used = 0;

    sprintf(digits, "%d", n);
    len = strlen(digits);
    for (size_t i = 0; i < len; i++) {
        out[used++] = digits[i];
        /* A comma follows every character that ends a group of three,
           counted from the right; that includes the very last one. */
        if ((len - 1 - i) % 3 == 0)
            out[used++] = ',';
    }
    /* Replace the comma written after the last character. */
    out[used - 1] = 0;
}

Egads! I do this all the time, using gcc/g++ and glibc on linux and yes, the ' operator may be non-standard, but I like the simplicity of it.
#include <stdio.h>
#include <locale.h>
/* Prints a number using the thousands grouping of the user's locale. */
int main()
{
    /* Without this call the ' flag does not format anything. */
    setlocale(LC_ALL,"");
    const int bignum = 12345678;
    printf("Big number: %'d\n",bignum);
    return 0;
}
Gives output of:
Big number: 12,345,678
Just have to remember the 'setlocale' call in there, otherwise it won't format anything.

Perhaps a locale-aware version would be interesting.
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <limits.h>
/* Moves *grouping on to the next entry of a localeconv() grouping string and
   returns the size of the group now in effect. Returns 0 when the next entry
   is CHAR_MAX; when the next entry is '\0' the current entry is returned
   again without moving. */
static int next_group(char const **grouping) {
    const char following = (*grouping)[1];

    if (following == CHAR_MAX)
        return 0;
    if (following != '\0')
        *grouping += 1;
    return **grouping;
}
/* Formats N into buf using the thousands separator, grouping and negative
   sign that localeconv() reports for the current locale.
   The text is built backwards from the end of buf and then moved to the
   front. Returns the number of characters written, not counting the
   terminating '\0'; on failure (buffer too small) buf is set to the empty
   string and 0 is returned.
   NOTE(review): N = -N overflows for the most negative long. */
size_t commafmt(char *buf, /* Buffer for formatted string */
int bufsize, /* Size of buffer */
long N) /* Number to convert */
{
int i; /* unused: the loops below declare their own i */
int len = 1; /* characters written so far, counting the one in progress */
int posn = 1; /* position of the current digit within its group */
int sign = 1;
char *ptr = buf + bufsize - 1; /* next position to write, moving left */
struct lconv *fmt_info = localeconv();
char const *tsep = fmt_info->thousands_sep;
char const *group = fmt_info->grouping;
char const *neg = fmt_info->negative_sign;
size_t sep_len = strlen(tsep);
size_t group_len = strlen(group); /* unused */
size_t neg_len = strlen(neg);
int places = (int)*group; /* size of the current group; 0 disables grouping */
if (bufsize < 2)
{
/* Shared failure exit, also reached by goto from the loops below. */
ABORT:
*buf = '\0';
return 0;
}
*ptr-- = '\0';
--bufsize;
if (N < 0L)
{
sign = -1;
N = -N;
}
for ( ; len <= bufsize; ++len, ++posn)
{
*ptr-- = (char)((N % 10L) + '0');
if (0L == (N /= 10L))
break;
if (places && (0 == (posn % places)))
{
places = next_group(&group);
/* Separator bytes are written last-to-first because ptr moves left. */
for (int i=sep_len; i>0; i--) {
*ptr-- = tsep[i-1];
if (++len >= bufsize)
goto ABORT;
}
}
if (len >= bufsize)
goto ABORT;
}
if (sign < 0)
{
if (len >= bufsize)
goto ABORT;
for (int i=neg_len; i>0; i--) {
*ptr-- = neg[i-1];
if (++len >= bufsize)
goto ABORT;
}
}
/* Move the finished text, including its '\0', to the start of buf. */
memmove(buf, ++ptr, len + 1);
return (size_t)len;
}
/* Test harness, compiled only when TEST is defined. */
#ifdef TEST
#include <stdio.h>
/* Number of elements in the array x. */
#define elements(x) (sizeof(x)/sizeof(x[0]))
/* Prints i and then -i, each formatted by commafmt, one per line. */
void show(long i) {
char buffer[32];
commafmt(buffer, sizeof(buffer), i);
printf("%s\n", buffer);
commafmt(buffer, sizeof(buffer), -i);
printf("%s\n", buffer);
}
/* Formats a series of values of increasing length in the user's locale. */
int main() {
long inputs[] = {1, 12, 123, 1234, 12345, 123456, 1234567, 12345678 };
for (int i=0; i<elements(inputs); i++) {
/* Select the locale from the environment (repeated on every pass). */
setlocale(LC_ALL, "");
show(inputs[i]);
}
return 0;
}
#endif
This does have a bug (but one I'd consider fairly minor). On two's complement hardware, it won't convert the most-negative number correctly, because it attempts to convert a negative number to its equivalent positive number with N = -N;. In two's complement, the maximally negative number doesn't have a corresponding positive number unless you promote it to a larger type. One way to get around this is by converting the number to the corresponding unsigned type (but it's somewhat non-trivial).

Without recursion or string handling, a mathematical approach:
#include <stdio.h>
#include <math.h>
/*
 * Print n to stdout in decimal with a comma between each group of three
 * digits (e.g. -1234567 -> "-1,234,567").
 *
 * The order of magnitude is found with integer arithmetic rather than
 * pow()/log10(), and the magnitude is held in an unsigned int so INT_MIN is
 * handled without overflow.
 */
void print_number( int n )
{
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    unsigned int order_of_magnitude = 1;

    /* Largest power of 1000 that does not exceed the magnitude. */
    while (magnitude / order_of_magnitude >= 1000u)
        order_of_magnitude *= 1000u;

    if (n < 0)
        putchar( '-' );

    /* Leading group is printed without zero padding. */
    printf( "%u", magnitude / order_of_magnitude ) ;

    for( magnitude %= order_of_magnitude, order_of_magnitude /= 1000u;
         order_of_magnitude > 0;
         magnitude %= order_of_magnitude, order_of_magnitude /= 1000u )
    {
        printf( ",%03u", magnitude / order_of_magnitude ) ;
    }
}
Similar in principle to Pax's recursive solution, but by calculating the order of magnitude in advance, recursion is avoided (at some considerable expense perhaps).
Note also that the actual character used to separate thousands is locale specific.
Edit:See #Chux's comments below for improvements.

Based on #Greg Hewgill's, but takes negative numbers into account and returns the string size.
/*
 * Write num into dst as decimal text with ',' between groups of three digits.
 * A leading '-' is copied through untouched.
 *
 * Returns the length of the string stored in dst (excluding the NUL).
 */
size_t str_format_int_grouped(char dst[16], int num)
{
    char digits[16];
    int remaining = sprintf(digits, "%d", num);   /* characters still to copy */
    const char *in = digits;
    char *out = dst;

    if (*in == '-') {
        *out++ = *in++;
        remaining--;
    }

    while (*in != '\0') {
        *out++ = *in++;
        remaining--;
        /* A separator goes wherever a whole number of triples remains. */
        if (remaining > 0 && remaining % 3 == 0) {
            *out++ = ',';
        }
    }

    *out = '\0';
    return (size_t)(out - dst);
}

Needed to do something similar myself but rather than printing directly, needed to go to a buffer. Here's what I came up with. Works backwards.
/*
 * Format Integer into String as decimal digits with ',' between groups of
 * three. String must be large enough for the result and its NUL.
 *
 * Returns the length of the formatted text.
 */
unsigned int IntegerToCommaString(char *String, unsigned long long Integer)
{
    /* Count the decimal digits (at least one, so 0 prints as "0"). */
    unsigned int digit_count = 1;
    for (unsigned long long probe = Integer; probe >= 10; probe /= 10)
        digit_count++;

    /* One separator for every complete triple after the leading group. */
    unsigned int total = digit_count + (digit_count - 1) / 3;

    char *cursor = String + total;
    *cursor = '\0';

    /* Fill from the right, emitting a comma before every fourth digit. */
    unsigned long long rest = Integer;
    unsigned int in_group = 0;
    for (unsigned int k = 0; k < digit_count; k++) {
        if (in_group == 3) {
            *--cursor = ',';
            in_group = 0;
        }
        *--cursor = (char)('0' + (rest % 10));
        rest /= 10;
        in_group++;
    }
    return total;
}
Be aware that it's only designed for unsigned integers and you must ensure that the buffer is large enough.

There's no real simple way to do this in C. I would just modify an int-to-string function to do it:
/*
 * Write n into out as decimal text with ',' between groups of three digits.
 * out must have room for the result (at most 15 bytes for a 32-bit int).
 *
 * The text is built least-significant digit first and then reversed in
 * place. Zero, negative values and INT_MIN are all handled.
 */
void format_number(int n, char * out) {
    /* Magnitude in unsigned arithmetic: -n would overflow for INT_MIN. */
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    int out_index = 0;

    /* do/while so that 0 still yields one digit. */
    do {
        /* Every fourth slot of the reversed string holds a separator. */
        if ((out_index + 1) % 4 == 0) {
            out[out_index++] = ',';
        }
        out[out_index++] = (char)('0' + magnitude % 10u);
        magnitude /= 10u;
    } while (magnitude != 0);

    if (n < 0) {
        out[out_index++] = '-';
    }
    out[out_index] = '\0';

    /* Reverse in place; avoids the non-standard strrev(). */
    for (int lo = 0, hi = out_index - 1; lo < hi; lo++, hi--) {
        char tmp = out[lo];
        out[lo] = out[hi];
        out[hi] = tmp;
    }
}

My answer does not format the result exactly like the illustration in the question, but may fulfill the actual need in some cases with a simple one-liner or macro. One can extend it to generate more thousand-groups as necessary.
The result will look for example as follows:
Value: 0'000'012'345
The code:
/* Each group is cast to unsigned long long so that every argument matches
   its %llu conversion, whatever integer type `value` has. */
printf("Value: %llu'%03llu'%03llu'%03llu\n",
       (unsigned long long)(value / 1000 / 1000 / 1000),
       (unsigned long long)((value / 1000 / 1000) % 1000),
       (unsigned long long)((value / 1000) % 1000),
       (unsigned long long)(value % 1000));

#include <stdio.h>
/*
 * Print n to stdout, followed by a newline, with '.' between groups of
 * three digits.
 *
 * The digits are built right-to-left in a local buffer. The buffer is
 * NUL-terminated before it is filled so puts() never reads past it, and the
 * magnitude is kept unsigned so LLONG_MIN does not overflow.
 */
void punt(long long n){
    char s[28];
    int i = 27;
    unsigned long long mag = (n < 0) ? 0ULL - (unsigned long long)n
                                     : (unsigned long long)n;

    if (n < 0) {
        putchar('-');
    }

    s[i--] = '\0';
    do {
        s[i--] = (char)(mag % 10 + '0');
        /* Digits occupy 26,25,24 / 22,21,20 / ...; separators fall on
           indices congruent to 3 mod 4, and only when digits remain. */
        if (i % 4 == 3 && mag > 9) {
            s[i--] = '.';
        }
        mag /= 10;
    } while (mag);

    puts(&s[i + 1]);
}
/* Exercise punt() on small, negative, zero and extreme values. */
int main(){
    const long long samples[] = {
        2134567890,
        987,
        9876,
        -987,
        -9876,
        -654321,
        0,
        1000000000,
        0x7FFFFFFFFFFFFFFF,
        0x8000000000000001, // -max + 1 ...
    };
    const int count = (int)(sizeof samples / sizeof samples[0]);

    for (int k = 0; k < count; k++) {
        punt(samples[k]);
    }
}
My solution uses a . instead of a ,
It is left to the reader to change this.

This is old and there are plenty of answers but the question was not "how can I write a routine to add commas" but "how can it be done in C"? The comments pointed to this direction but on my Linux system with GCC, this works for me:
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
/* Demo: print a 64-bit value with the locale's thousands grouping. */
int main(void)
{
    /* LC_ALL overrides LC_NUMERIC, so clear it before reading the locale. */
    unsetenv("LC_ALL");
    setlocale(LC_NUMERIC, "");
    /* The LL suffix makes the argument a long long, as %lld requires. */
    printf("%'lld\n", 3141592653589LL);
    return 0;
}
When this is run, I get:
$ cc -g comma.c -o comma && ./comma
3,141,592,653,589
If I unset the LC_ALL variable before running the program the unsetenv is not necessary.

Another solution, by saving the result into an int array, maximum size of 7 because the long long int type can handle numbers in the range 9,223,372,036,854,775,807 to -9,223,372,036,854,775,807. (Note it is not an unsigned value).
Non-recursive printing function
/*
 * Print the three-digit groups stored in numbers[] (least significant group
 * at index 0) separated by commas and followed by a single space.
 *
 * numbers  - groups; numbers[1] == -1 marks a value that has only one group
 * loc      - index of the most significant group
 * negative - non-zero to print a leading '-'
 */
static void printNumber (int numbers[8], int loc, int negative)
{
    if (negative)
        printf("-");

    /* Single group: no separators and no zero padding. */
    if (numbers[1] == -1) {
        printf("%d ", numbers[0]);
        return;
    }

    /* Leading group unpadded, every later group padded to three digits. */
    printf("%d,", numbers[loc]);
    for (int k = loc - 1; k >= 0; k--) {
        if (k == 0)
            printf("%03d ", numbers[k]);   /* last group ends with a space */
        else
            printf("%03d,", numbers[k]);
    }
}
main function call
/*
 * Split n into base-1000 groups (least significant first) in numbers[],
 * store -1 after the last group, then print the result via printNumber().
 *
 * The magnitude is held in an unsigned long long so that LLONG_MIN, whose
 * negation does not fit in long long, is split correctly.
 */
static void getNumWcommas (long long int n, int numbers[8])
{
    int i;
    int negative = (n < 0);
    unsigned long long magnitude = negative ? 0ULL - (unsigned long long)n
                                            : (unsigned long long)n;

    /* A 64-bit magnitude needs at most seven groups. */
    for (i = 0; i < 7; i++)
    {
        if (magnitude < 1000)
        {
            numbers[i] = (int)magnitude;
            numbers[i+1] = -1;      /* end marker read by printNumber() */
            break;
        }
        numbers[i] = (int)(magnitude % 1000);
        magnitude /= 1000;
    }
    printNumber(numbers, i, negative);// non recursive print
}
testing output
-9223372036854775807: -9,223,372,036,854,775,807
-1234567890 : -1,234,567,890
-123456 : -123,456
-12345 : -12,345
-1000 : -1,000
-999 : -999
-1 : -1
0 : 0
1 : 1
999 : 999
1000 : 1,000
12345 : 12,345
123456 : 123,456
1234567890 : 1,234,567,890
9223372036854775807 : 9,223,372,036,854,775,807
In main() function:
int numberSeparated[8];
long long int number = 1234567890LL;
getNumWcommas(number, numberSeparated);
If printing is all that's needed then move int numberSeparated[8]; inside the function getNumWcommas and call it this way getNumWcommas(number).

Another iterative function
/*
 * Print n to stdout with ',' between groups of three digits.
 *
 * Returns the number of characters written. The magnitude is held in an
 * unsigned int so INT_MIN is handled without overflow.
 */
int p(int n) {
    int written = 0;
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;

    if(n < 0) {
        written += printf("-");
    }

    /* a[0] stays 0 and doubles as the value printed when n == 0. */
    unsigned int a[sizeof(int) * CHAR_BIT / 3] = { 0 };
    unsigned int *pa = a;
    while(magnitude > 0) {
        *++pa = magnitude % 1000;
        magnitude /= 1000;
    }

    /* Most significant group unpadded, the rest zero-padded. */
    written += printf("%u", *pa);
    while(pa > a + 1) {
        written += printf(",%03u", *--pa);
    }
    return written;
}

Here is the slimmest, most size- and speed-efficient implementation of this kind of decimal digit formatting:
/* Fallback so the function builds when no branch-hint macro is provided. */
#ifndef unlikely
#define unlikely(x) (x)
#endif

/*
 * Format value right-to-left with ',' between groups of three digits.
 *
 * value       - number to format
 * endOfbuffer - pointer one past the last byte of the caller's buffer; the
 *               text is written backwards from here, ending with a NUL
 * plus        - when true, non-negative values get a leading '+'
 *
 * Returns a pointer to the first character of the formatted string, which
 * lies inside the caller's buffer. The magnitude is kept in an unsigned int
 * so INT_MIN does not overflow on negation.
 */
const char *formatNumber (
    int value,
    char *endOfbuffer,
    bool plus)
{
    unsigned int magnitude = (value < 0) ? 0u - (unsigned int)value
                                         : (unsigned int)value;
    int charCount = -1;   /* digits emitted in the current group, minus one */

    *--endOfbuffer = 0;
    do
    {
        if (unlikely (++charCount == 3))
        {
            charCount = 0;
            *--endOfbuffer = ',';
        }
        *--endOfbuffer = (char) (magnitude % 10 + '0');
    }
    while ((magnitude /= 10) != 0);

    if (unlikely (value < 0))
        *--endOfbuffer = '-';
    else if (unlikely (plus))
        *--endOfbuffer = '+';
    return endOfbuffer;
}
Use as following:
char buffer[16];
fprintf (stderr, "test : %s.", formatNumber (1234567890, buffer + 16, true));
Output:
test : +1,234,567,890.
Some advantages:
The function takes a pointer to the end of the buffer because the digits are generated in reverse order; as a result, there is no need to reverse the generated string afterwards (strrev).
This function produces a single string that can be used by any later algorithm. It neither depends on nor requires multiple printf/sprintf calls, which are terribly slow and always context-specific.
Minimum number of divide operators (/, %).

Secure format_commas, with negative numbers:
Because VS < 2015 doesn't implement snprintf, you need to do this
/* Only Visual Studio releases before 2015 (_MSC_VER < 1900) lack snprintf;
   _TRUNCATE makes an over-long result truncate instead of invoking the
   invalid-parameter handler. */
#if defined(_MSC_VER) && _MSC_VER < 1900
#define snprintf(buf, len, ...) _snprintf_s(buf, len, _TRUNCATE, __VA_ARGS__)
#endif
And then
/*
 * Write n into out as decimal text with an apostrophe between groups of
 * three digits (e.g. -1234567 -> "-1'234'567").
 *
 * out must have room for the result (at most 15 bytes for a 32-bit int).
 * Returns out. The magnitude is formatted as an unsigned value so INT_MIN,
 * for which abs() is undefined, is handled correctly.
 */
char* format_commas(int n, char *out)
{
    char digits[16];
    char *dst = out;
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    int len = snprintf(digits, sizeof digits, "%u", magnitude);

    if (n < 0)
    {
        *dst++ = '-';
    }

    for (int i = 0; i < len; i++) {
        /* Separator before any digit that starts a complete triple. */
        if (i > 0 && (len - i) % 3 == 0) {
            *dst++ = '\'';
        }
        *dst++ = digits[i];
    }
    *dst = 0;
    return out;
}
Example usage:
size_t currentSize = getCurrentRSS();
size_t peakSize = getPeakRSS();
printf("Current size: %d\n", currentSize);
printf("Peak size: %d\n\n\n", peakSize);
char* szcurrentSize = (char*)malloc(100 * sizeof(char));
char* szpeakSize = (char*)malloc(100 * sizeof(char));
printf("Current size (f): %s\n", format_commas((int)currentSize, szcurrentSize));
printf("Peak size (f): %s\n", format_commas((int)currentSize, szpeakSize));
free(szcurrentSize);
free(szpeakSize);

A modified version of #paxdiablo's solution, but using WCHAR and wsprintf:
static WCHAR buffer[10];
static int pos = 0;
void printfcomma(const int &n) {
if (n < 0) {
wsprintf(buffer + pos, TEXT("-"));
pos = lstrlen(buffer);
printfcomma(-n);
return;
}
if (n < 1000) {
wsprintf(buffer + pos, TEXT("%d"), n);
pos = lstrlen(buffer);
return;
}
printfcomma(n / 1000);
wsprintf(buffer + pos, TEXT(",%03d"), n % 1000);
pos = lstrlen(buffer);
}
void my_sprintf(const int &n)
{
pos = 0;
printfcomma(n);
}

I'm new in C programming. Here is my simple code.
/*
 * Read an integer and print it with ',' between groups of three digits,
 * followed by a '.'.
 */
int main()
{
    // 1223 => 1,223
    int n = 0;
    unsigned int a[10];
    printf(" n: ");
    if (scanf_s("%d", &n) != 1)
    {
        return 1;   /* no number was read */
    }

    /* Work on the magnitude so negative input is grouped as well. */
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    if (n < 0)
    {
        printf("-");
    }

    /* do/while so that 0 still produces one group. */
    int i = 0;
    do
    {
        a[i] = magnitude % 1000;
        magnitude /= 1000;
        i++;
    } while (magnitude > 0);

    for (int j = i - 1; j >= 0; j--)
    {
        /* Only the leading group is unpadded; 1005 must print as 1,005. */
        if (j == i - 1)
        {
            printf("%u", a[j]);
        }
        else
        {
            printf("%03u", a[j]);
        }
        printf(j == 0 ? "." : ",");
    }
    getch();
    return 0;
}

Require: <stdio.h> + <string.h>.
Advantage: short, readable, based on the format of scanf-family. And assume no comma on the right of decimal point.
/*
 * Copy the numeric string in to out, inserting ',' between groups of three
 * digits in the integer part only.
 *
 * in  - decimal text such as "-12345.67"; a leading sign is optional, and
 *       when there is no '.' the whole string is the integer part
 * out - destination; must have room for the input plus the added commas
 */
void add_commas(char *in, char *out) {
    size_t len_in = strlen(in);
    /* A leading sign is not a digit and must not attract a comma. */
    size_t first_digit = (in[0] == '-' || in[0] == '+') ? 1 : 0;
    /* Integer part ends at the last '.', or at the end when there is none. */
    const char *dot = strrchr(in, '.');
    size_t len_int = dot ? (size_t)(dot - in) : len_in;   /* len_int(123.4) = 3 */
    size_t pos = 0;

    for (size_t i = 0; i < len_in; ++i) {
        if (i > first_digit && i < len_int && (len_int - i) % 3 == 0)
            out[pos++] = ',';
        out[pos++] = in[i];
    }
    out[pos] = 0; /* Append the '\0' */
}
Example, to print a formatted double:
#include <stdio.h>
#include <string.h>
#define COUNT_DIGIT_MAX 100
/* Demo: format a double to two decimals, then group its integer part. */
int main() {
    char input[COUNT_DIGIT_MAX+1] = { 0 };
    char output[COUNT_DIGIT_MAX+1] = { 0 };
    double sum = 30678.7414;
    double monthly = sum/12;

    snprintf(input, COUNT_DIGIT_MAX, "%.2f", monthly);
    add_commas(input, output);
    puts(output);
}
Output:
2,556.56

Using C++'s std::string as return value with possibly the least overhead and not using any std library functions (sprintf, to_string, etc.).
// Return num as decimal text with ',' between groups of three digits.
// The text is built right-to-left in a local buffer. Zero yields "0", and
// the magnitude is held in an unsigned int so INT_MIN does not overflow.
string group_digs_c(int num)
{
    const unsigned int BUF_SIZE = 128;
    char buf[BUF_SIZE];
    char *pbuf = &buf[BUF_SIZE - 1];
    unsigned int magnitude = (num < 0) ? 0u - static_cast<unsigned int>(num)
                                       : static_cast<unsigned int>(num);
    int k = 0;   // digits emitted so far

    *pbuf = '\0';
    // do/while so that 0 still produces one digit.
    do
    {
        if (k > 0 && k % 3 == 0)
            *--pbuf = ',';
        *--pbuf = static_cast<char>((magnitude % 10) + '0');
        magnitude /= 10;
        ++k;
    } while (magnitude);

    if (num < 0)
        *--pbuf = '-';

    return string(pbuf);
}

Here is a simple portable solution relying on sprintf:
#include <stdio.h>
// assuming out points to an array of sufficient size
/*
 * Format n into out with at least min_digits digits and ',' between groups
 * of three. out must be large enough for the result.
 *
 * sprintf writes the plain digits first; the triples are then shifted right
 * in place, from the last group backwards, to open a slot for each comma.
 * Returns out.
 */
char *format_commas(char *out, int n, int min_digits) {
    int total = sprintf(out, "%.*d", min_digits, n);
    int sign = (out[0] == '-') ? 1 : 0;
    int separators = (total - sign - 1) / 3;

    char *src = out + total;          /* one past the last plain digit */
    char *dst = src + separators;     /* one past the last final character */
    *dst = '\0';

    for (; separators > 0; separators--) {
        /* Move one triple into place, then put its comma in front. */
        for (int d = 0; d < 3; d++) {
            *--dst = *--src;
        }
        *--dst = ',';
    }
    return out;
}
The code is easy to adapt for other integer types.

There are many interesting contributions here. Some covered all cases, some did not. I picked four of the contributions to test, found some failure cases during testing and then added a solution of my own.
I tested all methods for both accuracy and speed. Even though the OP only requested a solution for one positive number, I upgraded the contributions that didn't cover all possible numbers (so the code below may be slightly different from the original postings). The cases that weren't covered include: 0, negative numbers and the minimum number (INT_MIN).
I changed the declared type from "int" to "long long" since it's more general and all ints will get promoted to long long. I also standardized the call interface to include the number as well as a buffer to contain the formatted string (like some of the contributions) and returned a pointer to the buffer:
char* funcName(long long number_to_format, char* string_buffer);
Including a buffer parameter is considered by some to be "better" than having the function: 1) contain a static buffer (would not be re-entrant) or 2) allocate space for the buffer (would require caller to de-allocate the memory) or 3) print the result directly to stdout (would not be as generally useful since the output may be targeted for a GUI widget, file, pty, pipe, etc.).
I tried to use the same function names as the original contributions to make it easier to refer back to the originals. Contributed functions were modified as needed to pass the accuracy test so that the speed test would be meaningful. The results are included here in case you would like to test more of the contributed techniques for comparison. All code and test code used to generate the results are shown below.
So, here are the results:
Accuracy Test (test cases: LLONG_MIN, -999, -99, 0, 99, 999, LLONG_MAX):
----------------------------------------------------
print_number:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
fmtLocale:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
fmtCommas:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
format_number:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
itoa_commas:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
Speed Test: (1 million calls, values reflect average time per call)
----------------------------------------------------
print_number: 0.747 us (microsec) per call
fmtLocale: 0.222 us (microsec) per call
fmtCommas: 0.212 us (microsec) per call
format_number: 0.124 us (microsec) per call
itoa_commas: 0.085 us (microsec) per call
Since all contributed techniques are fast (< 1 microsecond on my laptop), unless you need to format millions of numbers, any of the techniques should be acceptable. It's probably best to choose the technique that is most readable to you.
Here is the code:
#line 2 "comma.c"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <locale.h>
#include <limits.h>
// ----------------------------------------------------------
/*
 * Format n into buf with ',' between groups of three digits; returns buf.
 *
 * The order of magnitude is found with integer arithmetic (pow()/log10()
 * round for large values), the magnitude is held in an unsigned long long so
 * LLONG_MIN is representable, and every conversion matches its argument.
 */
char* print_number( long long n, char buf[32] ) {
    unsigned long long magnitude = (n < 0) ? 0ULL - (unsigned long long)n
                                           : (unsigned long long)n;
    unsigned long long order_of_magnitude = 1;
    char *ptr = buf;

    /* Largest power of 1000 that does not exceed the magnitude. */
    while (magnitude / order_of_magnitude >= 1000)
        order_of_magnitude *= 1000;

    if (n < 0)
        *ptr++ = '-';

    /* Leading group unpadded; sprintf's return value advances the cursor. */
    ptr += sprintf(ptr, "%llu", magnitude / order_of_magnitude ) ;
    for( magnitude %= order_of_magnitude, order_of_magnitude /= 1000;
         order_of_magnitude > 0;
         magnitude %= order_of_magnitude, order_of_magnitude /= 1000 )
    {
        ptr += sprintf(ptr, ",%03llu", magnitude / order_of_magnitude );
    }
    return buf;
}
// ----------------------------------------------------------
/*
 * Format i into buf using printf's ' flag, which applies the current
 * locale's thousands grouping (the caller must have called setlocale()).
 * Returns buf.
 *
 * The write is bounded to the 32-byte buffer: a locale whose separator is a
 * multibyte sequence can make the result longer than 31 bytes.
 */
char* fmtLocale(long long i, char buf[32]) {
    snprintf(buf, 32, "%'lld", i); // requires setLocale in main
    return buf;
}
// ----------------------------------------------------------
/*
 * Format num into dst as decimal text with ',' between groups of three
 * digits; a leading '-' is copied through. Returns dst.
 */
char* fmtCommas(long long num, char dst[32]) {
    char digits[27];
    int remaining = sprintf(digits, "%lld", num);   /* characters left to copy */
    const char *in = digits;
    char *out = dst;

    if (*in == '-') {
        *out++ = *in++;
        remaining--;
    }

    while (*in != '\0') {
        *out++ = *in++;
        remaining--;
        /* A separator goes wherever a whole number of triples remains. */
        if (remaining > 0 && remaining % 3 == 0) {
            *out++ = ',';
        }
    }

    *out = '\0';
    return dst;
}
// ----------------------------------------------------------
/*
 * Format n into out with ',' between groups of three digits; returns out.
 *
 * The text is built least-significant digit first and reversed in place.
 * The magnitude is taken in unsigned arithmetic, so LLONG_MIN needs no
 * after-the-fact correction and no signed overflow occurs.
 */
char* format_number(long long n, char out[32]) {
    unsigned long long magnitude = (n < 0) ? 0ULL - (unsigned long long)n
                                           : (unsigned long long)n;
    int out_index = 0;

    /* do/while so that 0 still yields one digit. */
    do {
        /* Every fourth slot of the reversed string holds a separator. */
        if ((out_index + 1) % 4 == 0) {
            out[out_index++] = ',';
        }
        out[out_index++] = (char)('0' + magnitude % 10);
        magnitude /= 10;
    } while (magnitude != 0);

    if (n < 0) { out[out_index++] = '-'; }
    out[out_index] = '\0';

    // then you reverse the out string
    for (int lo = 0, hi = out_index - 1; lo < hi; ++lo, --hi) {
        char tmp = out[lo];
        out[lo] = out[hi];
        out[hi] = tmp;
    }
    return out;
}
// ----------------------------------------------------------
/*
 * Format i right-to-left into the tail of buf with ',' between groups of
 * three digits.
 *
 * Returns a pointer to the first character of the result, which lies inside
 * buf and is generally not buf itself. The magnitude is taken in unsigned
 * arithmetic, so LLONG_MIN is handled without signed overflow.
 */
char* itoa_commas(long long i, char buf[32]) {
    char* p = buf + 31;
    unsigned long long magnitude = (i < 0) ? 0ULL - (unsigned long long)i
                                           : (unsigned long long)i;

    *p = '\0';                                      // terminate string
    for (int j = 0; ; ++j) {
        *--p = (char)('0' + magnitude % 10);        // insert digit
        if ((magnitude /= 10) == 0) break;
        if (j % 3 == 2) *--p = ',';                 // insert a comma
    }
    if (i < 0) { *--p = '-'; }
    return p;
}
// ----------------------------------------------------------
// Test Accuracy
// ----------------------------------------------------------
/*
 * Run func over a fixed set of boundary values and print the formatted
 * results on one comma-separated line beneath the function's name.
 */
void test_accuracy(char* name, char* (*func)(long long n, char* buf)) {
    long long samples[] = { LLONG_MIN, -999, -99, 0, 99, 999, LLONG_MAX };
    const int sample_count = (int)(sizeof(samples) / sizeof(samples[0]));
    char sbuf[32]; // string buffer

    printf("%s:\n", name);
    for (int k = 0; k < sample_count; ++k) {
        char *text = func(samples[k], sbuf);
        /* First entry is indented, later ones are comma-separated. */
        if (k == 0) {
            printf(" %s", text);
        } else {
            printf(", %s", text);
        }
    }
    printf("\n");
}
// ----------------------------------------------------------
// Test Speed
// ----------------------------------------------------------
/*
 * Time one million calls of func on LLONG_MAX and print the average
 * processor time per call in microseconds, as measured by clock().
 */
void test_speed(char* name, char* (*func)(long long n, char* buf)) {
    const int cycleCount = 1000000;
    char sbuf[32]; // string buffer
    clock_t start = clock();

    for (int i = 0; i < cycleCount; ++i) {
        /* Only the elapsed time matters; the formatted text is discarded. */
        (void)func(LLONG_MAX, sbuf);
    }

    /* clock ticks -> microseconds */
    double elapsed = (double)(clock() - start) / (CLOCKS_PER_SEC / 1000000.0);
    printf("%14s: %7.3f us (microsec) per call\n", name, elapsed / cycleCount);
}
// ----------------------------------------------------------
/*
 * Benchmark driver: run the accuracy test and then the speed test for every
 * formatter, in a fixed order.
 */
int main(int argc, char* argv[]){
    struct candidate {
        char *label;
        char *(*format)(long long n, char *buf);
    };
    struct candidate candidates[] = {
        { "print_number",  print_number  },
        { "fmtLocale",     fmtLocale     },
        { "fmtCommas",     fmtCommas     },
        { "format_number", format_number },
        { "itoa_commas",   itoa_commas   },
    };
    const int candidate_count = (int)(sizeof candidates / sizeof candidates[0]);

    (void)argc;
    (void)argv;

    /* fmtLocale relies on the environment's locale being active. */
    setlocale(LC_ALL, "");

    printf("\nAccuracy Test: (LLONG_MIN, -999, 0, 99, LLONG_MAX)\n");
    printf("----------------------------------------------------\n");
    for (int k = 0; k < candidate_count; ++k) {
        test_accuracy(candidates[k].label, candidates[k].format);
    }

    printf("\nSpeed Test: 1 million calls\n\n");
    printf("----------------------------------------------------\n");
    for (int k = 0; k < candidate_count; ++k) {
        test_speed(candidates[k].label, candidates[k].format);
    }
    return 0;
}

Can be done pretty easily...
//Make sure output buffer is big enough and that input is a valid null terminated string
//of decimal digits, optionally preceded by a '+' or '-' sign.
//Copies input to output with ',' inserted between groups of three digits.
void pretty_number(const char* input, char * output)
{
    size_t iInputLen = strlen(input);
    //A leading sign is not a digit and must not be followed by a comma.
    size_t iFirstDigit = (input[0] == '-' || input[0] == '+') ? 1 : 0;
    size_t iOutputBufferPos = 0;
    for(size_t i = 0; i < iInputLen; i++)
    {
        //Separator before any digit that starts a complete triple.
        if(i > iFirstDigit && (iInputLen-i) % 3 == 0)
        {
            output[iOutputBufferPos++] = ',';
        }
        output[iOutputBufferPos++] = input[i];
    }
    output[iOutputBufferPos] = '\0';
}
Example call:
char szBuffer[512];
pretty_number("1234567", szBuffer);
//strcmp(szBuffer, "1,234,567") == 0

/*
 * Print n to stdout with ',' between groups of three digits.
 *
 * A comma is written only in front of a digit that starts a complete
 * triple, so one- and two-digit numbers no longer get a trailing comma.
 */
void printfcomma ( long long unsigned int n)
{
    char nstring[100];
    int m = sprintf(nstring, "%llu", n);   /* number of digits */

    for (int i = 0; i < m; i++)
    {
        if (i > 0 && (m - i) % 3 == 0)
            putchar(',');
        putchar(nstring[i]);
    }
}

// separate thousands
int digit;
int idx = 0;
static char buffer[32];
char* p = &buffer[32];
*--p = '\0';
for (int i = fCounter; i != 0; i /= 10)
{
digit = i % 10;
if ((p - buffer) % 4 == 0)
*--p = ' ';
*--p = digit + '0';
}

How to format a number using comma as thousands separator in C?

In C, how can I format a large number from e.g. 1123456789 to 1,123,456,789?
I tried using printf("%'10d\n", 1123456789), but that doesn't work.
Could you advise anything? The simpler the solution the better.

If your printf supports the ' flag (as required by POSIX 2008 printf()), you can probably do it just by setting your locale appropriately. Example:
#include <stdio.h>
#include <locale.h>
/* Demo: print one integer with the locale's thousands grouping. */
int main(void)
{
    const int sample = 1123456789;

    /* Only numeric formatting needs the environment's locale. */
    setlocale(LC_NUMERIC, "");

    /* The ' flag asks printf to group digits according to the locale. */
    printf("%'d\n", sample);
    return 0;
}
And build & run:
$ ./example
1,123,456,789
Tested on Mac OS X & Linux (Ubuntu 10.10).

You can do it recursively as follows (beware INT_MIN if you're using two's complement, you'll need extra code to manage that):
/* Recursive helper: print non-negative n with ',' between groups of three
   digits, most significant group first. */
void printfcomma2 (int n) {
    if (n < 1000) {
        printf ("%d", n);
        return;
    }
    printfcomma2 (n/1000);
    printf (",%03d", n%1000);
}

/*
 * Print n to stdout with ',' between groups of three digits.
 *
 * A negative value with four or more digits is split into its leading part
 * and last group before negating, so INT_MIN, which has no positive
 * counterpart, is printed correctly.
 */
void printfcomma (int n) {
    if (n < 0) {
        printf ("-");
        if (n <= -1000) {
            /* Division truncates toward zero, so both parts negate safely. */
            printfcomma2 (-(n / 1000));
            printf (",%03d", -(n % 1000));
            return;
        }
        n = -n;
    }
    printfcomma2 (n);
}
A summary:
User calls printfcomma with an integer, the special case of negative numbers is handled by simply printing "-" and making the number positive (this is the bit that won't work with INT_MIN).
When you enter printfcomma2, a number less than 1,000 will just print and return.
Otherwise the recursion will be called on the next level up (so 1,234,567 will be called with 1,234, then 1) until a number less than 1,000 is found.
Then that number will be printed and we'll walk back up the recursion tree, printing a comma and the next number as we go.
There is also the more succinct version though it does unnecessary processing in checking for negative numbers at every level (not that this will matter given the limited number of recursion levels). This one is a complete program for testing:
#include <stdio.h>
/*
 * Print n to stdout with ',' between groups of three digits, recursively.
 *
 * A negative value with four or more digits is split before negating:
 * negating INT_MIN directly overflows and would recurse without end.
 */
void printfcomma (int n) {
    if (n < 0) {
        printf ("-");
        if (n <= -1000) {
            /* Division truncates toward zero, so both parts negate safely. */
            printfcomma (-(n / 1000));
            printf (",%03d", -(n % 1000));
        } else {
            printfcomma (-n);
        }
        return;
    }
    if (n < 1000) {
        printf ("%d", n);
        return;
    }
    printfcomma (n/1000);
    printf (",%03d", n%1000);
}
/* Test driver: print each sample left-aligned, then its grouped form. */
int main (void) {
    int samples[] = {-1234567890, -123456, -12345, -1000, -999, -1,
                     0, 1, 999, 1000, 12345, 123456, 1234567890};
    const int count = (int)(sizeof(samples)/sizeof(*samples));

    for (int k = 0; k < count; k++) {
        printf ("%-15d: ", samples[k]);
        printfcomma (samples[k]);
        printf ("\n");
    }
    return 0;
}
and the output is:
-1234567890 : -1,234,567,890
-123456 : -123,456
-12345 : -12,345
-1000 : -1,000
-999 : -999
-1 : -1
0 : 0
1 : 1
999 : 999
1000 : 1,000
12345 : 12,345
123456 : 123,456
1234567890 : 1,234,567,890
An iterative solution for those who don't trust recursion (although the only problem with recursion tends to be stack space which will not be an issue here since it'll only be a few levels deep even for a 64-bit integer):
/*
 * Print n to stdout with ',' between groups of three digits, iteratively.
 *
 * The leading group is peeled off first while the lower groups are kept in
 * `rest`, which is then replayed from the most significant group down. All
 * arithmetic is on the unsigned magnitude, so INT_MIN is handled.
 */
void printfcomma (int n) {
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    unsigned int rest = 0;     /* groups below the leading one */
    unsigned int scale = 1;    /* 1000^(number of groups held in rest) */

    if (n < 0) {
        printf ("-");
    }

    while (magnitude >= 1000) {
        rest = rest + scale * (magnitude % 1000);
        magnitude /= 1000;
        scale *= 1000;
    }
    printf ("%u", magnitude);

    while (scale != 1) {
        scale /= 1000;
        magnitude = rest / scale;
        rest = rest % scale;
        printf (",%03u", magnitude);
    }
}
Both of these generate 2,147,483,647 for INT_MAX.
All the code above is for comma-separating three-digit groups but you can use other characters as well, such as a space:
/* Recursive helper: print non-negative n with a space between groups of
   three digits, most significant group first. */
void printfspace2 (int n) {
    if (n < 1000) {
        printf ("%d", n);
        return;
    }
    printfspace2 (n/1000);
    printf (" %03d", n%1000);
}

/*
 * Print n to stdout with a space between groups of three digits.
 *
 * A negative value with four or more digits is split into its leading part
 * and last group before negating, so INT_MIN is printed correctly.
 */
void printfspace (int n) {
    if (n < 0) {
        printf ("-");
        if (n <= -1000) {
            /* Division truncates toward zero, so both parts negate safely. */
            printfspace2 (-(n / 1000));
            printf (" %03d", -(n % 1000));
            return;
        }
        n = -n;
    }
    printfspace2 (n);
}

Here's a very simple implementation. This function contains no error checking, buffer sizes must be verified by the caller. It also does not work for negative numbers. Such improvements are left as an exercise for the reader.
/*
 * Write n into out as decimal text with ',' between groups of three digits.
 * out must have room for the result (at most 15 bytes for a 32-bit int).
 *
 * A leading '-' is copied through before grouping, so negative numbers are
 * formatted correctly.
 */
void format_commas(int n, char *out)
{
    char buf[20];
    int remaining = sprintf(buf, "%d", n);   /* characters still to copy */
    const char *p = buf;

    if (*p == '-') {
        *out++ = *p++;
        remaining--;
    }
    for (; *p != 0; p++) {
        *out++ = *p;
        remaining--;
        /* A separator goes wherever a whole number of triples remains. */
        if (remaining > 0 && remaining % 3 == 0) {
            *out++ = ',';
        }
    }
    *out = 0;
}

Egads! I do this all the time, using gcc/g++ and glibc on linux and yes, the ' operator may be non-standard, but I like the simplicity of it.
#include <stdio.h>
#include <locale.h>
/* Demo: print one integer using the locale's digit grouping. */
int main(void)
{
    /* Adopt the environment's locale before any formatted output. */
    setlocale(LC_ALL, "");

    const int sample = 12345678;

    /* The ' flag asks printf to apply the locale's thousands grouping. */
    printf("Big number: %'d\n", sample);
    return 0;
}
Gives output of:
Big number: 12,345,678
Just have to remember the 'setlocale' call in there, otherwise it won't format anything.

Perhaps a locale-aware version would be interesting.
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <limits.h>
/*
 * Step through a localeconv()->grouping string and return the size of the
 * next digit group.
 *
 * If the following element is CHAR_MAX, no further grouping is performed
 * (returns 0). If it is '\0', the current group size repeats, so the cursor
 * is left where it is.
 */
static int next_group(char const **grouping) {
    if ((*grouping)[1] == CHAR_MAX)
        return 0;
    if ((*grouping)[1] != '\0')
        ++*grouping;
    return **grouping;
}

/*
 * Format N into buf using the thousands separator and grouping reported by
 * localeconv().
 *
 * buf     - destination buffer
 * bufsize - size of buf in bytes, including room for the terminating NUL
 * N       - number to convert
 *
 * Returns the length of the formatted string, or 0 (with buf set to the empty
 * string when it has room for one) if the result does not fit.
 */
size_t commafmt(char *buf, int bufsize, long N)
{
    int len = 1;    /* characters produced so far, counting the current digit */
    int posn = 1;   /* 1-based position of the current digit inside its group */
    char *ptr;
    struct lconv *fmt_info = localeconv();
    char const *tsep = fmt_info->thousands_sep;
    char const *group = fmt_info->grouping;
    char const *neg = fmt_info->negative_sign;
    size_t sep_len = strlen(tsep);
    size_t neg_len;
    int places = (int)*group;
    int negative = (N < 0L);
    /* Take the magnitude in unsigned arithmetic: -N overflows for LONG_MIN. */
    unsigned long mag = negative ? 0UL - (unsigned long)N : (unsigned long)N;
    size_t i;

    /* negative_sign may be empty (it is in the "C" locale); never drop the sign. */
    if (*neg == '\0')
        neg = "-";
    neg_len = strlen(neg);

    if (bufsize < 2) {
        if (bufsize == 1)
            *buf = '\0';
        return 0;
    }

    /* Digits are generated least-significant first, so fill from the end. */
    ptr = buf + bufsize - 1;
    *ptr-- = '\0';
    --bufsize;

    for ( ; len <= bufsize; ++len, ++posn) {
        *ptr-- = (char)((mag % 10UL) + '0');
        if (0UL == (mag /= 10UL))
            break;
        if (places && (0 == (posn % places))) {
            places = next_group(&group);
            posn = 0;   /* restart the count so a different next group size is honoured */
            for (i = sep_len; i > 0; i--) {
                *ptr-- = tsep[i - 1];
                if (++len >= bufsize)
                    goto ABORT;
            }
        }
        if (len >= bufsize)
            goto ABORT;
    }

    if (negative) {
        if (len >= bufsize)
            goto ABORT;
        for (i = neg_len; i > 0; i--) {
            *ptr-- = neg[i - 1];
            if (++len >= bufsize)
                goto ABORT;
        }
    }

    /* Slide the right-aligned result (and its NUL) to the front of buf. */
    memmove(buf, ++ptr, (size_t)len + 1);
    return (size_t)len;

ABORT:
    *buf = '\0';
    return 0;
}
#ifdef TEST
#include <stdio.h>
#define elements(x) (sizeof(x)/sizeof(x[0]))
/* Print i and then -i, each formatted by commafmt(), one per line. */
void show(long i) {
    char buffer[32];
    commafmt(buffer, sizeof(buffer), i);
    printf("%s\n", buffer);
    commafmt(buffer, sizeof(buffer), -i);
    printf("%s\n", buffer);
}
/* Test driver: format a ladder of values of increasing digit count. */
int main(void) {
    long inputs[] = {1, 12, 123, 1234, 12345, 123456, 1234567, 12345678 };
    /* The locale only needs selecting once, not on every iteration. */
    setlocale(LC_ALL, "");
    for (size_t i = 0; i < elements(inputs); i++) {
        show(inputs[i]);
    }
    return 0;
}
#endif
This does have a bug (but one I'd consider fairly minor). On two's complement hardware, it won't convert the most-negative number correctly, because it attempts to convert a negative number to its equivalent positive number with N = -N;. In two's complement, the maximally negative number doesn't have a corresponding positive number unless you promote it to a larger type. One way to get around this is by converting the number to the corresponding unsigned type (but it's somewhat non-trivial).

Without recursion or string handling, a mathematical approach:
#include <stdio.h>
#include <math.h>
/*
 * Print n to stdout in decimal with a comma between each group of three
 * digits (e.g. -1234567 -> "-1,234,567").
 *
 * The order of magnitude is found with integer arithmetic rather than
 * pow()/log10(), and the magnitude is held in an unsigned int so INT_MIN is
 * handled without overflow.
 */
void print_number( int n )
{
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    unsigned int order_of_magnitude = 1;

    /* Largest power of 1000 that does not exceed the magnitude. */
    while (magnitude / order_of_magnitude >= 1000u)
        order_of_magnitude *= 1000u;

    if (n < 0)
        putchar( '-' );

    /* Leading group is printed without zero padding. */
    printf( "%u", magnitude / order_of_magnitude ) ;

    for( magnitude %= order_of_magnitude, order_of_magnitude /= 1000u;
         order_of_magnitude > 0;
         magnitude %= order_of_magnitude, order_of_magnitude /= 1000u )
    {
        printf( ",%03u", magnitude / order_of_magnitude ) ;
    }
}
Similar in principle to Pax's recursive solution, but by calculating the order of magnitude in advance, recursion is avoided (at some considerable expense perhaps).
Note also that the actual character used to separate thousands is locale specific.
Edit:See #Chux's comments below for improvements.

Based on #Greg Hewgill's, but takes negative numbers into account and returns the string size.
/*
 * Write num into dst as decimal text with ',' between groups of three digits.
 * A leading '-' is copied through untouched.
 *
 * Returns the length of the string stored in dst (excluding the NUL).
 */
size_t str_format_int_grouped(char dst[16], int num)
{
    char digits[16];
    int remaining = sprintf(digits, "%d", num);   /* characters still to copy */
    const char *in = digits;
    char *out = dst;

    if (*in == '-') {
        *out++ = *in++;
        remaining--;
    }

    while (*in != '\0') {
        *out++ = *in++;
        remaining--;
        /* A separator goes wherever a whole number of triples remains. */
        if (remaining > 0 && remaining % 3 == 0) {
            *out++ = ',';
        }
    }

    *out = '\0';
    return (size_t)(out - dst);
}

Needed to do something similar myself but rather than printing directly, needed to go to a buffer. Here's what I came up with. Works backwards.
/*
 * Format Integer into String as decimal digits with ',' between groups of
 * three. String must be large enough for the result and its NUL.
 *
 * Returns the length of the formatted text.
 */
unsigned int IntegerToCommaString(char *String, unsigned long long Integer)
{
    /* Count the decimal digits (at least one, so 0 prints as "0"). */
    unsigned int digit_count = 1;
    for (unsigned long long probe = Integer; probe >= 10; probe /= 10)
        digit_count++;

    /* One separator for every complete triple after the leading group. */
    unsigned int total = digit_count + (digit_count - 1) / 3;

    char *cursor = String + total;
    *cursor = '\0';

    /* Fill from the right, emitting a comma before every fourth digit. */
    unsigned long long rest = Integer;
    unsigned int in_group = 0;
    for (unsigned int k = 0; k < digit_count; k++) {
        if (in_group == 3) {
            *--cursor = ',';
            in_group = 0;
        }
        *--cursor = (char)('0' + (rest % 10));
        rest /= 10;
        in_group++;
    }
    return total;
}
Be aware that it's only designed for unsigned integers and you must ensure that the buffer is large enough.

There's no real simple way to do this in C. I would just modify an int-to-string function to do it:
/*
 * Write n into out as decimal text with ',' between groups of three digits.
 * out must have room for the result (at most 15 bytes for a 32-bit int).
 *
 * The text is built least-significant digit first and then reversed in
 * place. Zero, negative values and INT_MIN are all handled.
 */
void format_number(int n, char * out) {
    /* Magnitude in unsigned arithmetic: -n would overflow for INT_MIN. */
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;
    int out_index = 0;

    /* do/while so that 0 still yields one digit. */
    do {
        /* Every fourth slot of the reversed string holds a separator. */
        if ((out_index + 1) % 4 == 0) {
            out[out_index++] = ',';
        }
        out[out_index++] = (char)('0' + magnitude % 10u);
        magnitude /= 10u;
    } while (magnitude != 0);

    if (n < 0) {
        out[out_index++] = '-';
    }
    out[out_index] = '\0';

    /* Reverse in place; avoids the non-standard strrev(). */
    for (int lo = 0, hi = out_index - 1; lo < hi; lo++, hi--) {
        char tmp = out[lo];
        out[lo] = out[hi];
        out[hi] = tmp;
    }
}

My answer does not format the result exactly like the illustration in the question, but may fulfill the actual need in some cases with a simple one-liner or macro. One can extend it to generate more thousand-groups as necessary.
The result will look for example as follows:
Value: 0'000'012'345
The code:
/* Each group is cast to unsigned long long so that every argument matches
   its %llu conversion, whatever integer type `value` has. */
printf("Value: %llu'%03llu'%03llu'%03llu\n",
       (unsigned long long)(value / 1000 / 1000 / 1000),
       (unsigned long long)((value / 1000 / 1000) % 1000),
       (unsigned long long)((value / 1000) % 1000),
       (unsigned long long)(value % 1000));

#include <stdio.h>
/*
 * Print n to stdout, followed by a newline, with '.' between groups of
 * three digits.
 *
 * The digits are built right-to-left in a local buffer. The buffer is
 * NUL-terminated before it is filled so puts() never reads past it, and the
 * magnitude is kept unsigned so LLONG_MIN does not overflow.
 */
void punt(long long n){
    char s[28];
    int i = 27;
    unsigned long long mag = (n < 0) ? 0ULL - (unsigned long long)n
                                     : (unsigned long long)n;

    if (n < 0) {
        putchar('-');
    }

    s[i--] = '\0';
    do {
        s[i--] = (char)(mag % 10 + '0');
        /* Digits occupy 26,25,24 / 22,21,20 / ...; separators fall on
           indices congruent to 3 mod 4, and only when digits remain. */
        if (i % 4 == 3 && mag > 9) {
            s[i--] = '.';
        }
        mag /= 10;
    } while (mag);

    puts(&s[i + 1]);
}
/* Exercise punt() on small, negative, zero and extreme values. */
int main(){
    const long long samples[] = {
        2134567890,
        987,
        9876,
        -987,
        -9876,
        -654321,
        0,
        1000000000,
        0x7FFFFFFFFFFFFFFF,
        0x8000000000000001, // -max + 1 ...
    };
    const int count = (int)(sizeof samples / sizeof samples[0]);

    for (int k = 0; k < count; k++) {
        punt(samples[k]);
    }
}
My solution uses a . instead of a ,
It is left to the reader to change this.

This is old and there are plenty of answers but the question was not "how can I write a routine to add commas" but "how can it be done in C"? The comments pointed to this direction but on my Linux system with GCC, this works for me:
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
/* Demo: print a 64-bit value with the locale's thousands grouping. */
int main(void)
{
    /* LC_ALL overrides LC_NUMERIC, so clear it before reading the locale. */
    unsetenv("LC_ALL");
    setlocale(LC_NUMERIC, "");
    /* The LL suffix makes the argument a long long, as %lld requires. */
    printf("%'lld\n", 3141592653589LL);
    return 0;
}
When this is run, I get:
$ cc -g comma.c -o comma && ./comma
3,141,592,653,589
If I unset the LC_ALL variable before running the program the unsetenv is not necessary.

Another solution, by saving the result into an int array, maximum size of 7 because the long long int type can handle numbers in the range 9,223,372,036,854,775,807 to -9,223,372,036,854,775,807. (Note it is not an unsigned value).
Non-recursive printing function
/*
 * Print the three-digit groups stored in numbers[] (least significant group
 * at index 0) separated by commas and followed by a single space.
 *
 * numbers  - groups; numbers[1] == -1 marks a value that has only one group
 * loc      - index of the most significant group
 * negative - non-zero to print a leading '-'
 */
static void printNumber (int numbers[8], int loc, int negative)
{
    if (negative)
        printf("-");

    /* Single group: no separators and no zero padding. */
    if (numbers[1] == -1) {
        printf("%d ", numbers[0]);
        return;
    }

    /* Leading group unpadded, every later group padded to three digits. */
    printf("%d,", numbers[loc]);
    for (int k = loc - 1; k >= 0; k--) {
        if (k == 0)
            printf("%03d ", numbers[k]);   /* last group ends with a space */
        else
            printf("%03d,", numbers[k]);
    }
}
main function call
/*
 * Split n into base-1000 groups (least significant first) in numbers[],
 * store -1 after the last group, then print the result via printNumber().
 *
 * The magnitude is held in an unsigned long long so that LLONG_MIN, whose
 * negation does not fit in long long, is split correctly.
 */
static void getNumWcommas (long long int n, int numbers[8])
{
    int i;
    int negative = (n < 0);
    unsigned long long magnitude = negative ? 0ULL - (unsigned long long)n
                                            : (unsigned long long)n;

    /* A 64-bit magnitude needs at most seven groups. */
    for (i = 0; i < 7; i++)
    {
        if (magnitude < 1000)
        {
            numbers[i] = (int)magnitude;
            numbers[i+1] = -1;      /* end marker read by printNumber() */
            break;
        }
        numbers[i] = (int)(magnitude % 1000);
        magnitude /= 1000;
    }
    printNumber(numbers, i, negative);// non recursive print
}
testing output
-9223372036854775807: -9,223,372,036,854,775,807
-1234567890 : -1,234,567,890
-123456 : -123,456
-12345 : -12,345
-1000 : -1,000
-999 : -999
-1 : -1
0 : 0
1 : 1
999 : 999
1000 : 1,000
12345 : 12,345
123456 : 123,456
1234567890 : 1,234,567,890
9223372036854775807 : 9,223,372,036,854,775,807
In main() function:
int numberSeparated[8];
long long int number = 1234567890LL;
getNumWcommas(number, numberSeparated);
If printing is all that's needed then move int numberSeparated[8]; inside the function getNumWcommas and call it this way getNumWcommas(number).

Another iterative function
/*
 * Print n to stdout with ',' between groups of three digits.
 *
 * Returns the number of characters written. The magnitude is held in an
 * unsigned int so INT_MIN is handled without overflow.
 */
int p(int n) {
    int written = 0;
    unsigned int magnitude = (n < 0) ? 0u - (unsigned int)n : (unsigned int)n;

    if(n < 0) {
        written += printf("-");
    }

    /* a[0] stays 0 and doubles as the value printed when n == 0. */
    unsigned int a[sizeof(int) * CHAR_BIT / 3] = { 0 };
    unsigned int *pa = a;
    while(magnitude > 0) {
        *++pa = magnitude % 1000;
        magnitude /= 1000;
    }

    /* Most significant group unpadded, the rest zero-padded. */
    written += printf("%u", *pa);
    while(pa > a + 1) {
        written += printf(",%03u", *--pa);
    }
    return written;
}

Here is the slimmest, most size- and speed-efficient implementation of this kind of decimal digit formatting:
/*
 * Format value as decimal text with ',' between groups of three digits.
 *
 * value       - number to format; every int value is accepted, including
 *               INT_MIN.
 * endOfbuffer - pointer one past the END of the caller's buffer. The text is
 *               written backwards from there, terminator first, so the
 *               digits never have to be reversed.
 * plus        - when true, a non-negative value gets a leading '+'.
 *
 * Returns a pointer to the first character of the NUL-terminated result,
 * which lies inside the caller's buffer. The caller must provide enough
 * room below endOfbuffer for the whole text.
 */
const char *formatNumber (
    int value,
    char *endOfbuffer,
    bool plus)
{
    /* Unsigned magnitude: negating INT_MIN as an int would overflow. */
    unsigned int magnitude = (unsigned int) value;
    int groupLen = 0;

    if (value < 0)
        magnitude = 0u - magnitude;

    *--endOfbuffer = '\0';
    do
    {
        /* Three digits already emitted: separate before the next one. */
        if (groupLen == 3)
        {
            groupLen = 0;
            *--endOfbuffer = ',';
        }
        *--endOfbuffer = (char) ('0' + magnitude % 10);
        groupLen++;
    }
    while ((magnitude /= 10) != 0);

    if (value < 0)
        *--endOfbuffer = '-';
    else if (plus)
        *--endOfbuffer = '+';
    return endOfbuffer;
}
Use as following:
char buffer[16];
fprintf (stderr, "test : %s.", formatNumber (1234567890, buffer + 16, true));
Output:
test : +1,234,567,890.
Some advantages:
The function takes a pointer to the end of the string buffer because the digits are produced in reverse order. As a result, there is no need to reverse the generated string (strrev).
This function produces one string that can be used in any algorithm afterwards. It neither depends on nor requires multiple printf/sprintf calls, which are terribly slow and always context-specific.
Minimum number of divide operators (/, %).

Secure format_commas, with negative numbers:
Because VS < 2015 doesn't implement snprintf, you need to do this
/*
 * Visual Studio releases before 2015 (_MSC_VER < 1900) do not provide
 * snprintf, so map it onto _snprintf_s there. Newer MSVC and other Windows
 * toolchains have a real snprintf and must not be overridden.
 * _TRUNCATE makes over-long output get cut off and NUL-terminated instead
 * of being reported as an invalid parameter.
 * NOTE(review): on truncation _snprintf_s returns -1 rather than the length
 * the full text would have had, so callers should not rely on that value.
 */
#if defined(_MSC_VER) && _MSC_VER < 1900
#define snprintf(buf, len, format, ...) _snprintf_s(buf, len, _TRUNCATE, format, __VA_ARGS__)
#endif
And then
/*
 * Write n into out as decimal text with an apostrophe (') between groups
 * of three digits, e.g. 1234567 -> "1'234'567".
 *
 * n   - value to format; every int value is accepted, including INT_MIN.
 * out - destination buffer supplied by the caller; it must be large enough
 *       for the sign, the digits, the separators and the terminator.
 *
 * Returns out.
 */
char* format_commas(int n, char *out)
{
    char digits[100];
    char *dst = out;
    /* Unsigned magnitude: abs(INT_MIN) is not representable as an int. */
    unsigned int magnitude = (unsigned int)n;
    int len;
    int idx;

    if (n < 0)
    {
        *dst++ = '-';
        magnitude = 0u - magnitude;
    }
    snprintf(digits, sizeof digits, "%u", magnitude);
    len = (int)strlen(digits);

    for (idx = 0; idx < len; idx++) {
        int remaining = len - idx - 1;   /* digits still to be copied */
        *dst++ = digits[idx];
        /* Separate whenever a whole number of groups is left. */
        if (remaining > 0 && remaining % 3 == 0) {
            *dst++ = '\'';
        }
    }
    *dst = 0;
    return out;
}
Example usage:
size_t currentSize = getCurrentRSS();
size_t peakSize = getPeakRSS();
printf("Current size: %d\n", currentSize);
printf("Peak size: %d\n\n\n", peakSize);
char* szcurrentSize = (char*)malloc(100 * sizeof(char));
char* szpeakSize = (char*)malloc(100 * sizeof(char));
printf("Current size (f): %s\n", format_commas((int)currentSize, szcurrentSize));
printf("Peak size (f): %s\n", format_commas((int)currentSize, szpeakSize));
free(szcurrentSize);
free(szpeakSize);

A modified version of @paxdiablo's solution, but using WCHAR and wsprintf:
static WCHAR buffer[10];
static int pos = 0;
void printfcomma(const int &n) {
if (n < 0) {
wsprintf(buffer + pos, TEXT("-"));
pos = lstrlen(buffer);
printfcomma(-n);
return;
}
if (n < 1000) {
wsprintf(buffer + pos, TEXT("%d"), n);
pos = lstrlen(buffer);
return;
}
printfcomma(n / 1000);
wsprintf(buffer + pos, TEXT(",%03d"), n % 1000);
pos = lstrlen(buffer);
}
void my_sprintf(const int &n)
{
pos = 0;
printfcomma(n);
}

I'm new to C programming. Here is my simple code.
/*
 * Read an integer from standard input and print it with ',' between groups
 * of three digits, followed by a '.', e.g. 1223 => 1,223.
 * Returns 1 when no integer could be read, 0 otherwise.
 */
int main()
{
    // 1223 => 1,223
    int n;
    int a[10];
    printf(" n: ");
    if (scanf_s("%d", &n) != 1)
    {
        return 1;   // nothing usable was entered
    }

    // Unsigned magnitude: -n cannot be represented when n is INT_MIN.
    unsigned int magnitude = (unsigned int)n;
    if (n < 0)
    {
        printf("-");
        magnitude = 0u - magnitude;
    }

    // Collect the groups, least significant first; do/while so that 0 yields one group.
    int i = 0;
    do
    {
        a[i] = (int)(magnitude % 1000);
        magnitude /= 1000;
        i++;
    } while (magnitude > 0);

    // The leading group is printed as is; every later group needs its leading zeros.
    printf("%d", a[i - 1]);
    for (int j = i - 2; j >= 0; j--)
    {
        printf(",%03d", a[j]);
    }
    printf(".");
    getch();
    return 0;
}

Require: <stdio.h> + <string.h>.
Advantage: short, readable, based on the format of scanf-family. And assume no comma on the right of decimal point.
/*
 * Copy the numeric string in to out, inserting ',' between groups of three
 * digits in the integer part.
 *
 * in  - NUL-terminated number such as "30678.74", "1234567" or "-1234.5".
 *       A leading '+' or '-' is copied through and never followed by a
 *       comma. Everything from the last '.' onwards is copied unchanged;
 *       without a '.', the whole string is the integer part.
 * out - destination buffer; it must have room for the input, the inserted
 *       commas and the terminator.
 */
void add_commas(char *in, char *out) {
    size_t len_in = strlen(in);
    /* Index of the first digit: skip an optional sign. */
    size_t start = (in[0] == '-' || in[0] == '+') ? 1 : 0;
    /* Index just past the integer part, e.g. 3 for "123.4". */
    const char *dot = strrchr(in, '.');
    size_t len_int = dot ? (size_t)(dot - in) : len_in;
    size_t pos = 0;
    for (size_t i = 0; i < len_in; ++i) {
        if (i > start && i < len_int && (len_int - i) % 3 == 0)
            out[pos++] = ',';
        out[pos++] = in[i];
    }
    out[pos] = 0; /* Append the '\0' */
}
Example, to print a formatted double:
#include <stdio.h>
#include <string.h>
/* Capacity, in characters, of the text buffers used by the demo below. */
#define COUNT_DIGIT_MAX 100
/* Demo: format sum/12 with two decimals, add the separators, print the result. */
int main() {
    char input[COUNT_DIGIT_MAX+1] = { 0 };
    char output[COUNT_DIGIT_MAX+1] = { 0 };
    const double sum = 30678.7414;
    const double share = sum / 12;

    snprintf(input, COUNT_DIGIT_MAX, "%.2f", share);
    add_commas(input, output);
    puts(output);
}
Output:
2,556.56

Using C++'s std::string as return value with possibly the least overhead and not using any std library functions (sprintf, to_string, etc.).
// Return num as decimal text with ',' between groups of three digits.
// Every int value is accepted: 0 yields "0" and INT_MIN is formatted
// correctly. The digits are produced right to left into a small stack
// buffer, so no reversing or moving of the text is needed.
std::string group_digs_c(int num)
{
    char buf[32];                         // ample room for digits, commas and sign
    char *const end = buf + sizeof buf;
    char *pbuf = end;
    // Unsigned magnitude: negating INT_MIN as an int would overflow.
    unsigned int magnitude = static_cast<unsigned int>(num);
    if (num < 0)
        magnitude = 0u - magnitude;

    int k = 0;                            // digits emitted so far
    do
    {
        if (k > 0 && k % 3 == 0)
            *--pbuf = ',';
        *--pbuf = static_cast<char>('0' + magnitude % 10);
        magnitude /= 10;
        ++k;
    } while (magnitude != 0);

    if (num < 0)
        *--pbuf = '-';
    return std::string(pbuf, static_cast<std::size_t>(end - pbuf));
}

Here is a simple portable solution relying on sprintf:
#include <stdio.h>
// Format n into out with ',' between groups of three digits and return out.
// min_digits is passed to sprintf as the precision, i.e. the minimum number
// of digits to produce. The caller must supply an array that is large enough
// for the digits, the separators and the terminator.
char *format_commas(char *out, int n, int min_digits) {
    int len = sprintf(out, "%.*d", min_digits, n);
    int sign = (out[0] == '-') ? 1 : 0;
    int seps = (len - sign - 1) / 3;      // number of commas to insert
    char *src = out + len;                // end of the plain digits
    char *dst = out + len + seps;         // end of the grouped text

    *dst = '\0';
    // Walk backwards: shift three digits to their final place, then put a
    // comma in front of them. dst stays ahead of src until the last comma.
    for (; seps > 0; seps--) {
        for (int moved = 0; moved < 3; moved++) {
            *--dst = *--src;
        }
        *--dst = ',';
    }
    return out;
}
The code is easy to adapt for other integer types.

There are many interesting contributions here. Some covered all cases, some did not. I picked four of the contributions to test, found some failure cases during testing and then added a solution of my own.
I tested all methods for both accuracy and speed. Even though the OP only requested a solution for one positive number, I upgraded the contributions that didn't cover all possible numbers (so the code below may be slightly different from the original postings). The cases that weren't covered include: 0, negative numbers and the minimum number (INT_MIN).
I changed the declared type from "int" to "long long" since it's more general and all ints will get promoted to long long. I also standardized the call interface to include the number as well as a buffer to contain the formatted string (like some of the contributions) and returned a pointer to the buffer:
char* funcName(long long number_to_format, char* string_buffer);
Including a buffer parameter is considered by some to be "better" than having the function: 1) contain a static buffer (would not be re-entrant) or 2) allocate space for the buffer (would require caller to de-allocate the memory) or 3) print the result directly to stdout (would not be as generally useful since the output may be targeted for a GUI widget, file, pty, pipe, etc.).
I tried to use the same function names as the original contributions to make it easier to refer back to the originals. Contributed functions were modified as needed to pass the accuracy test so that the speed test would be meaningful. The results are included here in case you would like to test more of the contributed techniques for comparison. All code and test code used to generate the results are shown below.
So, here are the results:
Accuracy Test (test cases: LLONG_MIN, -999, -99, 0, 99, 999, LLONG_MAX):
----------------------------------------------------
print_number:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
fmtLocale:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
fmtCommas:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
format_number:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
itoa_commas:
-9,223,372,036,854,775,808, -999, -99, 0, 99, 999, 9,223,372,036,854,775,807
Speed Test: (1 million calls, values reflect average time per call)
----------------------------------------------------
print_number: 0.747 us (microsec) per call
fmtLocale: 0.222 us (microsec) per call
fmtCommas: 0.212 us (microsec) per call
format_number: 0.124 us (microsec) per call
itoa_commas: 0.085 us (microsec) per call
Since all contributed techniques are fast (< 1 microsecond on my laptop), unless you need to format millions of numbers, any of the techniques should be acceptable. It's probably best to choose the technique that is most readable to you.
Here is the code:
#line 2 "comma.c"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <locale.h>
#include <limits.h>
// ----------------------------------------------------------
/*
 * Format n into buf with ',' between groups of three digits.
 *
 * The leading group is printed with its sign; each following group is
 * printed as the absolute value of a three-digit remainder, so n itself is
 * never negated and LLONG_MIN is handled.
 * All arithmetic is done in integers: the magnitude is found by repeated
 * division instead of log10()/pow(), which round for large values.
 *
 * Returns buf.
 */
char* print_number( long long n, char buf[32] ) {
    /* Largest power of 1000 that does not exceed |n| (1 when |n| < 1000). */
    long long order_of_magnitude = 1;
    for (long long rest = n / 1000; rest != 0; rest /= 1000) {
        order_of_magnitude *= 1000;
    }

    char *ptr = buf;
    ptr += sprintf(ptr, "%lld", n / order_of_magnitude);
    for( n %= order_of_magnitude, order_of_magnitude /= 1000;
         order_of_magnitude > 0;
         n %= order_of_magnitude, order_of_magnitude /= 1000 )
    {
        /* The quotient lies strictly between -1000 and 1000. */
        ptr += sprintf(ptr, ",%03lld", llabs(n / order_of_magnitude));
    }
    return buf;
}
// ----------------------------------------------------------
/*
 * Format i into buf using the printf apostrophe flag, which asks the C
 * library to insert the grouping characters of the current locale.
 * Returns buf.
 */
char* fmtLocale(long long i, char buf[32]) {
    char *const result = buf;
    sprintf(result, "%'lld", i); // grouping depends on the locale set by the program (setlocale)
    return result;
}
// ----------------------------------------------------------
/*
 * Format num into dst with ',' between groups of three digits.
 * The plain decimal text is produced with sprintf and then copied over,
 * inserting a comma whenever a whole number of groups is still to come.
 * Returns dst.
 */
char* fmtCommas(long long num, char dst[32]) {
    char digits[27];
    int remaining = sprintf(digits, "%lld", num);   /* characters left to copy */
    const char *in = digits;
    char *out = dst;

    /* The sign is copied through and does not count as a digit. */
    if (*in == '-') {
        *out++ = *in++;
        remaining--;
    }
    while (remaining > 0) {
        *out++ = *in++;
        remaining--;
        if (remaining > 0 && remaining % 3 == 0) {
            *out++ = ',';
        }
    }
    *out = '\0';
    return dst;
}
// ----------------------------------------------------------
/*
 * Format n into out with ',' between groups of three digits.
 *
 * The magnitude is taken in unsigned arithmetic, so LLONG_MIN needs no
 * special case. The text is built right to left in a local buffer and then
 * copied to the start of out, so nothing has to be reversed.
 *
 * Returns out.
 */
char* format_number(long long n, char out[32]) {
    unsigned long long magnitude = (n < 0) ? 0ULL - (unsigned long long)n
                                           : (unsigned long long)n;
    char tmp[32];
    char *const end = tmp + sizeof tmp;
    char *p = end;
    int digits = 0;                       /* digits emitted so far */

    *--p = '\0';
    do {
        if (digits > 0 && digits % 3 == 0) {
            *--p = ',';
        }
        *--p = (char)('0' + magnitude % 10);
        magnitude /= 10;
        digits++;
    } while (magnitude != 0);
    if (n < 0) {
        *--p = '-';
    }
    memcpy(out, p, (size_t)(end - p));    /* includes the terminator */
    return out;
}
// ----------------------------------------------------------
/*
 * Format i with ',' between groups of three digits, writing backwards from
 * the end of buf.
 *
 * The magnitude is taken in unsigned arithmetic, so LLONG_MIN needs no
 * special case.
 *
 * Returns a pointer to the first character of the result. The text ends at
 * buf[31], so the returned pointer is in general NOT buf itself.
 */
char* itoa_commas(long long i, char buf[32]) {
    unsigned long long magnitude = (i < 0) ? 0ULL - (unsigned long long)i
                                           : (unsigned long long)i;
    char* p = buf + 31;
    int digits = 0;                       /* digits emitted so far */

    *p = '\0';                            /* terminate string */
    do {
        if (digits > 0 && digits % 3 == 0) {
            *--p = ',';                   /* insert a comma */
        }
        *--p = (char)('0' + magnitude % 10);
        ++digits;
    } while ((magnitude /= 10) != 0);
    if (i < 0) {
        *--p = '-';
    }
    return p;
}
// ----------------------------------------------------------
// Test Accuracy
// ----------------------------------------------------------
/*
 * Print name, then the result of func for a fixed list of boundary values
 * on one line, separated by ", ".
 */
void test_accuracy(char* name, char* (*func)(long long n, char* buf)) {
    static const long long cases[] = { LLONG_MIN, -999, -99, 0, 99, 999, LLONG_MAX };
    const size_t case_count = sizeof cases / sizeof cases[0];
    char text[32];                        /* buffer handed to func */
    size_t k;

    printf("%s:\n", name);
    for (k = 0; k < case_count; ++k) {
        /* The first result is indented, every later one follows a comma. */
        printf("%s%s", (k == 0) ? " " : ", ", func(cases[k], text));
    }
    printf("\n");
}
// ----------------------------------------------------------
// Test Speed
// ----------------------------------------------------------
/*
 * Call func one million times with LLONG_MAX and print the average
 * processor time per call in microseconds, labelled with name.
 */
void test_speed(char* name, char* (*func)(long long n, char* buf)) {
    const int cycleCount = 1000000;
    char sbuf[32];                        /* buffer handed to func */
    const clock_t start = clock();

    for (int remaining = cycleCount; remaining > 0; --remaining) {
        (void)func(LLONG_MAX, sbuf);      /* result is not needed, only the time */
    }

    const clock_t stop = clock();
    const double ticks_per_us = CLOCKS_PER_SEC / 1000000.0;
    const double elapsed = (double)(stop - start) / ticks_per_us;
    printf("%14s: %7.3f us (microsec) per call\n", name, elapsed / cycleCount);
}
// ----------------------------------------------------------
/*
 * Run the accuracy test and then the speed test for every formatter.
 * The locale is taken from the environment first, which fmtLocale needs.
 */
int main(int argc, char* argv[]){
    struct candidate {
        char* label;
        char* (*format)(long long n, char* buf);
    };
    struct candidate candidates[] = {
        { "print_number",  print_number  },
        { "fmtLocale",     fmtLocale     },
        { "fmtCommas",     fmtCommas     },
        { "format_number", format_number },
        { "itoa_commas",   itoa_commas   },
    };
    const size_t count = sizeof candidates / sizeof candidates[0];
    size_t k;

    (void)argc;
    (void)argv;

    setlocale(LC_ALL, "");

    printf("\nAccuracy Test: (LLONG_MIN, -999, 0, 99, LLONG_MAX)\n");
    printf("----------------------------------------------------\n");
    for (k = 0; k < count; ++k) {
        test_accuracy(candidates[k].label, candidates[k].format);
    }

    printf("\nSpeed Test: 1 million calls\n\n");
    printf("----------------------------------------------------\n");
    for (k = 0; k < count; ++k) {
        test_speed(candidates[k].label, candidates[k].format);
    }
    return 0;
}

Can be done pretty easily...
// Copy input to output, inserting ',' between groups of three digits.
//
// input  - valid NUL-terminated string: an optional '+' or '-', then digits.
//          Anything after the leading run of digits (for example a
//          fractional part) is copied unchanged.
// output - must be big enough for the input, the commas and the terminator.
void pretty_number(const char* input, char * output)
{
    size_t iInputPos = 0;
    size_t iOutputBufferPos = 0;

    // The sign is copied through and must not count as a digit.
    if (input[0] == '-' || input[0] == '+')
    {
        output[iOutputBufferPos++] = input[iInputPos++];
    }

    // Length of the run of digits that gets grouped.
    size_t iDigits = 0;
    while (input[iInputPos + iDigits] >= '0' && input[iInputPos + iDigits] <= '9')
    {
        iDigits++;
    }

    for (size_t i = 0; i < iDigits; i++)
    {
        if ((iDigits - i) % 3 == 0 && i != 0)
        {
            output[iOutputBufferPos++] = ',';
        }
        output[iOutputBufferPos++] = input[iInputPos + i];
    }

    // Whatever follows the digits is passed through untouched.
    for (iInputPos += iDigits; input[iInputPos] != '\0'; iInputPos++)
    {
        output[iOutputBufferPos++] = input[iInputPos];
    }
    output[iOutputBufferPos] = '\0';
}
Example call:
// Format a digit string into a caller-provided buffer.
char szBuffer[512];
const char *szDigits = "1234567";
pretty_number(szDigits, szBuffer);
// Now strcmp(szBuffer, "1,234,567") == 0

/*
 * Print n to stdout with ',' between groups of three digits.
 * No comma is written after the last group, so one- and two-digit values
 * are printed as they are. No newline is written.
 */
void printfcomma ( long long unsigned int n)
{
    char nstring[100];
    int m;
    int i;

    sprintf(nstring, "%llu", n);
    m = (int)strlen(nstring);
    for (i = 0; i < m; i++)
    {
        /* A comma goes in front of every digit that starts a full group,
           except in front of the very first digit. */
        if (i > 0 && (m - i) % 3 == 0)
            printf(",");
        printf("%c", nstring[i]);
    }
}

// separate thousands
int digit;
int idx = 0;
static char buffer[32];
char* p = &buffer[32];
*--p = '\0';
for (int i = fCounter; i != 0; i /= 10)
{
digit = i % 10;
if ((p - buffer) % 4 == 0)
*--p = ' ';
*--p = digit + '0';
}

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

The fastest way to save graph to file in C - c

Related

How to unscramble a word and find all its matches in a txt file in C?

Is binary to decimal conversion rounded? how?

How can I multiply two strings containing 'huge numbers' (over 30 digits)?

How to format number adding points between each 3 numbers [duplicate]

How to format a number using comma as thousands separator in C?

Categories

Resources