csvparser.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       csvparser.c (3646B)
       ---
            1 /*
            2  * CSV parser (and example to convert to TSV).
            3  *
            4  * See also for reference:
            5  * RFC4180 - Common Format and MIME Type for Comma-Separated Values (CSV) Files
            6  * https://www.ietf.org/rfc/rfc4180.txt
            7  *
            8  * There are some changes against the RFC:
            9  *
           10  * - Text-encoding: UTF-8 or ASCII is assumed.
           11  * - Strings are quoted, quotes are escaped.
           12  * - Strings may be unquoted, but then it may not have a separator or CR or LF.
           13  * - Quoted strings can contain all control-characters until a non-quoted ".
           14  * - Line-endings do not matter (CRLF or LF are allowed).
           15  * - Each record does not need to have the same amount of columns.
           16  * - Maximum number of fields per record is hardcoded (64 at the moment).
           17  * - Ignores empty lines between records.
           18  *
           19  * - TIP: use `sed 1d` to remove column headers (assuming it is one line).
           20  */
           21 
           22 #include <errno.h>
           23 #include <stdint.h>
           24 #include <stdio.h>
           25 #include <stdlib.h>
           26 #include <string.h>
           27 
           28 #ifdef __OpenBSD__
           29 #include <unistd.h>
           30 #else
           31 #define pledge(a,b) 0
           32 #endif
           33 
           34 #define GETNEXT getchar
           35 
           36 struct field {
           37         char *buf;
           38         size_t bufsiz;
           39 };
           40 
           41 #define MAX_FIELDS 64
           42 static struct field fields[MAX_FIELDS];
           43 static int separator = ','; /* default separator */
           44 
           45 void
           46 fatal(const char *s)
           47 {
           48         fputs(s, stderr);
           49         exit(1);
           50 }
           51 
           52 void
           53 capacity(char **value, size_t *sz, size_t cur, size_t inc)
           54 {
           55         size_t need, newsiz;
           56         char *newp;
           57 
           58         /* check for addition overflow */
           59         if (cur > SIZE_MAX - inc) {
           60                 errno = EOVERFLOW;
           61                 fprintf(stderr, "realloc: %s\n", strerror(errno));
           62                 exit(1);
           63         }
           64         need = cur + inc;
           65 
           66         if (need > *sz) {
           67                 if (need > SIZE_MAX / 2) {
           68                         newsiz = SIZE_MAX;
           69                 } else {
           70                         for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; newsiz *= 2)
           71                                 ;
           72                 }
           73                 if (!(newp = realloc(*value, newsiz))) {
           74                         fprintf(stderr, "realloc: %s\n", strerror(errno));
           75                         exit(1);
           76                 }
           77                 *value = newp;
           78                 *sz = newsiz;
           79         }
           80 }
           81 
           82 void
           83 record(struct field *f, size_t nfields)
           84 {
           85         size_t i;
           86         const char *s;
           87 
           88         for (i = 0; i < nfields; i++) {
           89                 if (i)
           90                         putchar('\t');
           91                 for (s = f[i].buf; *s; s++) {
           92                         switch (*s) {
           93                         case '\\': putchar('\\'); putchar('\\'); break;
           94                         case '\n': putchar('\\'); putchar('n'); break;
           95                         case '\t': putchar('\\'); putchar('t'); break;
           96                         default:
           97                                 /* ignore control-characters */
           98                                 if ((unsigned char)*s < 0x20 || (unsigned char)*s == 0x7f)
           99                                         break;
          100                                 putchar(*s);
          101                                 break;
          102                         }
          103                 }
          104         }
          105         putchar('\n');
          106 }
          107 
          108 int
          109 main(int argc, char *argv[])
          110 {
          111         size_t i, v;
          112         int c;
          113 
          114         if (pledge("stdio", NULL) == -1)
          115                 fatal("pledge: stdio\n");
          116 
          117         /* separator, should not be: "\r\n */
          118         if (argc > 1 && argv[1][0] && !strchr("\r\n\"", argv[1][0]))
          119                 separator = argv[1][0];
          120 
          121         for (i = 0, v = 0; (c = GETNEXT()) != EOF;) {
          122 parse:
          123                 switch (c) {
          124                 case '"':
          125                         while ((c = GETNEXT()) != EOF) {
          126                                 if (c == '"') {
          127                                         if ((c = GETNEXT()) == EOF)
          128                                                 goto end;
          129                                         else if (c != '"')
          130                                                 goto parse;
          131                                 }
          132                                 capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
          133                                 fields[i].buf[v] = c;
          134                                 v++;
          135                         }
          136                         if ((c = GETNEXT()) == EOF)
          137                                 fatal("unexpected EOF\n");
          138                         break;
          139                 case '\n': /* new record */
          140                         capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
          141                         fields[i].buf[v] = '\0';
          142                         if (i || v)
          143                                 record(fields, i + 1);
          144                         v = 0;
          145                         i = 0;
          146                         break;
          147                 case '\r': /* ignore CR */
          148                         break;
          149                 default:
          150                         capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
          151                         if (c == separator) {
          152                                 fields[i].buf[v] = '\0';
          153                                 v = 0;
          154                                 if (i + 1 >= MAX_FIELDS)
          155                                         fatal("too many fields\n");
          156                                 i++;
          157                         } else {
          158                                 fields[i].buf[v] = c;
          159                                 v++;
          160                         }
          161                 }
          162         }
          163 end:
          164         /* handle end of record without newline */
          165         capacity(&fields[i].buf, &fields[i].bufsiz, v, 1);
          166         fields[i].buf[v] = '\0';
          167         if (i || v)
          168                 record(fields, i + 1);
          169 
          170         return 0;
          171 }