csvparser.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
csvparser.c (3646B)
---
1 /*
2 * CSV parser (and example to convert to TSV).
3 *
4 * See also for reference:
5 * RFC4180 - Common Format and MIME Type for Comma-Separated Values (CSV) Files
6 * https://www.ietf.org/rfc/rfc4180.txt
7 *
8 * There are some changes against the RFC:
9 *
10 * - Text-encoding: UTF-8 or ASCII is assumed.
11 * - Strings are quoted, quotes are escaped.
12 * - Strings may be unquoted, but then it may not have a separator or CR or LF.
13 * - Quoted strings can contain all control-characters until a non-quoted ".
14 * - Line-endings do not matter (CRLF or LF are allowed).
15 * - Each record does not need to have the same amount of columns.
16 * - Maximum number of fields per record is hardcoded (64 at the moment).
17 * - Ignores empty lines between records.
18 *
19 * - TIP: use `sed 1d` to remove column headers (assuming it is one line).
20 */
21
22 #include <errno.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27
/*
 * pledge() is declared in <unistd.h> when building on OpenBSD. Everywhere
 * else it is replaced by a macro that evaluates to 0, so the
 * `pledge(...) == -1` check in main() never triggers.
 */
#ifdef __OpenBSD__
#include <unistd.h>
#else
#define pledge(a,b) 0
#endif
33
/* function called to read the next input byte (EOF at end of input) */
#define GETNEXT getchar

/* growable buffer holding the data of one field */
struct field {
	char *buf; /* field data, (re)allocated by capacity() */
	size_t bufsiz; /* allocated size of buf in bytes */
};

/* maximum number of fields in a single record */
#define MAX_FIELDS 64
/* field buffers of the current record, reused for each record */
static struct field fields[MAX_FIELDS];
static int separator = ','; /* default separator */
44
/*
 * Print the message `s` to stderr exactly as given (no newline is added)
 * and terminate the program with exit status 1.
 */
void
fatal(const char *s)
{
	fprintf(stderr, "%s", s);
	exit(1);
}
51
/*
 * Make sure the buffer *value can hold at least `cur + inc` bytes.
 *
 * value: pointer to the buffer pointer (may point to NULL for a new buffer),
 *        updated when the buffer is reallocated.
 * sz:    pointer to the current allocated size in bytes, updated on growth.
 * cur:   number of bytes currently in use.
 * inc:   number of additional bytes that are about to be stored.
 *
 * When the buffer is too small it grows to the smallest size that is at
 * least `cur + inc`, starting from max(*sz, 64) and doubling. Existing
 * contents are preserved by realloc().
 *
 * Does not return on failure: on size overflow or allocation failure a
 * message is printed to stderr and the program exits with status 1.
 */
void
capacity(char **value, size_t *sz, size_t cur, size_t inc)
{
	size_t need, newsiz;
	char *newp;

	/* check for addition overflow */
	if (cur > SIZE_MAX - inc) {
		errno = EOVERFLOW;
		fprintf(stderr, "realloc: %s\n", strerror(errno));
		exit(1);
	}
	need = cur + inc;

	/* already large enough: nothing to do */
	if (need <= *sz)
		return;

	if (need > SIZE_MAX / 2) {
		/* doubling could overflow, request the maximum instead */
		newsiz = SIZE_MAX;
	} else {
		/*
		 * Stop as soon as newsiz is large enough. Since
		 * newsiz < need <= SIZE_MAX / 2 whenever the size is doubled,
		 * the multiplication cannot overflow.
		 */
		for (newsiz = *sz < 64 ? 64 : *sz; newsiz < need; newsiz *= 2)
			;
	}
	if (!(newp = realloc(*value, newsiz))) {
		fprintf(stderr, "realloc: %s\n", strerror(errno));
		exit(1);
	}
	*value = newp;
	*sz = newsiz;
}
81
82 void
83 record(struct field *f, size_t nfields)
84 {
85 size_t i;
86 const char *s;
87
88 for (i = 0; i < nfields; i++) {
89 if (i)
90 putchar('\t');
91 for (s = f[i].buf; *s; s++) {
92 switch (*s) {
93 case '\\': putchar('\\'); putchar('\\'); break;
94 case '\n': putchar('\\'); putchar('n'); break;
95 case '\t': putchar('\\'); putchar('t'); break;
96 default:
97 /* ignore control-characters */
98 if ((unsigned char)*s < 0x20 || (unsigned char)*s == 0x7f)
99 break;
100 putchar(*s);
101 break;
102 }
103 }
104 }
105 putchar('\n');
106 }
107
108 int
109 main(int argc, char *argv[])
110 {
111 size_t i, v;
112 int c;
113
114 if (pledge("stdio", NULL) == -1)
115 fatal("pledge: stdio\n");
116
117 /* separator, should not be: "\r\n */
118 if (argc > 1 && argv[1][0] && !strchr("\r\n\"", argv[1][0]))
119 separator = argv[1][0];
120
121 for (i = 0, v = 0; (c = GETNEXT()) != EOF;) {
122 parse:
123 switch (c) {
124 case '"':
125 while ((c = GETNEXT()) != EOF) {
126 if (c == '"') {
127 if ((c = GETNEXT()) == EOF)
128 goto end;
129 else if (c != '"')
130 goto parse;
131 }
132 capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
133 fields[i].buf[v] = c;
134 v++;
135 }
136 if ((c = GETNEXT()) == EOF)
137 fatal("unexpected EOF\n");
138 break;
139 case '\n': /* new record */
140 capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
141 fields[i].buf[v] = '\0';
142 if (i || v)
143 record(fields, i + 1);
144 v = 0;
145 i = 0;
146 break;
147 case '\r': /* ignore CR */
148 break;
149 default:
150 capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
151 if (c == separator) {
152 fields[i].buf[v] = '\0';
153 v = 0;
154 if (i + 1 >= MAX_FIELDS)
155 fatal("too many fields\n");
156 i++;
157 } else {
158 fields[i].buf[v] = c;
159 v++;
160 }
161 }
162 }
163 end:
164 /* handle end of record without newline */
165 capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
166 fields[i].buf[v] = '\0';
167 if (i || v)
168 record(fields, i + 1);
169
170 return 0;
171 }