json.c - json2tsv - JSON to TSV converter
 (HTM) git clone git://git.codemadness.org/json2tsv
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       json.c (8152B)
       ---
            1 #include <errno.h>
            2 #include <stdint.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 
            7 #ifndef GETNEXT
            8 #define GETNEXT getchar_unlocked
            9 #endif
           10 
           11 #include "json.h"
           12 
           13 /* ctype-like macros, but always compatible with ASCII / UTF-8 */
           14 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
           15 #define ISXDIGIT(c) ((((unsigned)c) - '0' < 10) || ((unsigned)c | 32) - 'a' < 6)
           16 
           17 static int
           18 codepointtoutf8(long r, char *s)
           19 {
           20         if (r == 0) {
           21                 return 0; /* NUL byte */
           22         } else if (r <= 0x7F) {
           23                 /* 1 byte: 0aaaaaaa */
           24                 s[0] = r;
           25                 return 1;
           26         } else if (r <= 0x07FF) {
           27                 /* 2 bytes: 00000aaa aabbbbbb */
           28                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
           29                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
           30                 return 2;
           31         } else if (r <= 0xFFFF) {
           32                 /* 3 bytes: aaaabbbb bbcccccc */
           33                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
           34                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
           35                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
           36                 return 3;
           37         } else {
           38                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
           39                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
           40                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
           41                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
           42                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
           43                 return 4;
           44         }
           45 }
           46 
           47 static int
           48 hexdigit(int c)
           49 {
           50         if (c >= '0' && c <= '9')
           51                 return c - '0';
           52         else if (c >= 'a' && c <= 'f')
           53                 return 10 + (c - 'a');
           54         else if (c >= 'A' && c <= 'F')
           55                 return 10 + (c - 'A');
           56         return 0;
           57 }
           58 
           59 static int
           60 capacity(char **value, size_t *sz, size_t cur, size_t inc)
           61 {
           62         size_t need, newsiz;
           63         char *newp;
           64 
           65         /* check for addition overflow */
           66         if (cur > SIZE_MAX - inc) {
           67                 errno = ENOMEM;
           68                 return -1;
           69         }
           70         need = cur + inc;
           71 
           72         if (need > *sz) {
           73                 if (need > SIZE_MAX / 2) {
           74                         newsiz = SIZE_MAX;
           75                 } else {
           76                         for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; newsiz *= 2)
           77                                 ;
           78                 }
           79                 if (!(newp = realloc(*value, newsiz)))
           80                         return -1; /* up to caller to free *value */
           81                 *value = newp;
           82                 *sz = newsiz;
           83         }
           84         return 0;
           85 }
           86 
           87 #define EXPECT_VALUE         "{[\"-0123456789tfn"
           88 #define EXPECT_STRING        "\""
           89 #define EXPECT_END           "}],"
           90 #define EXPECT_OBJECT_STRING EXPECT_STRING "}"
           91 #define EXPECT_OBJECT_KEY    ":"
           92 #define EXPECT_ARRAY_VALUE   EXPECT_VALUE "]"
           93 
           94 #define JSON_INVALID()       do { ret = JSON_ERROR_INVALID; goto end; } while (0);
           95 
/*
 * Parse JSON read one character at a time through GETNEXT() until EOF and
 * report every value through the callback cb.
 *
 * cb is called as cb(nodes, depth + 1, value, valuelen):
 *  - nodes:      the array of nodes from the root down to the current value;
 *                the type, index and name fields are maintained by the parser.
 *  - depth + 1:  the number of nodes in use.
 *  - value:      the NUL-terminated text of the value: the unescaped string,
 *                the characters of the number, "true", "false", "null", or ""
 *                for an array or object (reported when it is opened).
 *  - valuelen:   length of value, excluding the NUL byte.
 * Object keys are not reported on their own: a key is copied into the name
 * field of the node at the current depth before its value is parsed.
 *
 * Returns 0 on success, JSON_ERROR_INVALID when the input is not accepted
 * (unexpected character, invalid escape, control character or EOF inside a
 * string, unbalanced or too deeply nested arrays/objects) and JSON_ERROR_MEM
 * when a buffer could not be grown.
 */
int
parsejson(void (*cb)(struct json_node *, size_t, const char *, size_t))
{
        struct json_node nodes[JSON_MAX_NODE_DEPTH] = { { 0 } };
        size_t depth = 0, p = 0, len, sz = 0;
        long cp, hi, lo;
        char pri[128], *str = NULL;
        /* ret starts as the memory error so a failed capacity() can simply
         * jump to the cleanup code */
        int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
        const char *expect = EXPECT_VALUE;

        /* the root node gets an empty name */
        if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
                goto end;
        nodes[0].name[0] = '\0';

        while (1) {
                c = GETNEXT();
                /* the number case jumps here with the character that ended the
                 * number, which has been read but not handled yet */
handlechr:
                if (c == EOF)
                        break;

                /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
                if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                        continue;

                /* NUL is tested separately: strchr() would otherwise match the
                 * terminating NUL byte of the expect string */
                if (!c || !strchr(expect, c))
                        JSON_INVALID();

                switch (c) {
                case ':':
                        iskey = 0;
                        expect = EXPECT_VALUE;
                        break;
                case '"':
                        nodes[depth].type = JSON_TYPE_STRING;
                        escape = 0;
                        len = 0;
                        /* collect the unescaped string into str */
                        while (1) {
                                c = GETNEXT();
                                /* re-entry point for a character that was read ahead
                                 * while looking for a surrogate pair */
chr:
                                /* EOF or control char: 0x7f is not defined as a control char in RFC 8259 */
                                if (c < 0x20)
                                        JSON_INVALID();

                                if (escape) {
                                        /* re-entry point for the character following a
                                         * backslash that was read ahead after a high surrogate */
escchr:
                                        escape = 0;
                                        switch (c) {
                                        case '"': /* FALLTHROUGH */
                                        case '\\':
                                        case '/': break;
                                        case 'b': c = '\b'; break;
                                        case 'f': c = '\f'; break;
                                        case 'n': c = '\n'; break;
                                        case 'r': c = '\r'; break;
                                        case 't': c = '\t'; break;
                                        case 'u': /* hex hex hex hex */
                                                /* reserve room for the longest UTF-8 sequence */
                                                if (capacity(&str, &sz, len, 4) == -1)
                                                        goto end;
                                                /* 4 hex digits, most significant nibble first */
                                                for (i = 12, cp = 0; i >= 0; i -= 4) {
                                                        if ((c = GETNEXT()) == EOF || !ISXDIGIT(c))
                                                                JSON_INVALID(); /* invalid code point */
                                                        cp |= (hexdigit(c) << i);
                                                }
                                                /* RFC 8259 - 7. Strings - surrogates.
                                                 * 0xd800 - 0xdbff - high surrogates */
                                                if (cp >= 0xd800 && cp <= 0xdbff) {
                                                        /* not followed by an escape: emit the lone
                                                         * surrogate as-is and handle c as a normal
                                                         * string character */
                                                        if ((c = GETNEXT()) != '\\') {
                                                                len += codepointtoutf8(cp, &str[len]);
                                                                goto chr;
                                                        }
                                                        /* followed by a different escape: emit the
                                                         * lone surrogate and handle c as the escape
                                                         * character */
                                                        if ((c = GETNEXT()) != 'u') {
                                                                len += codepointtoutf8(cp, &str[len]);
                                                                goto escchr;
                                                        }
                                                        for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
                                                                if ((c = GETNEXT()) == EOF || !ISXDIGIT(c))
                                                                        JSON_INVALID(); /* invalid code point */
                                                                lo |= (hexdigit(c) << i);
                                                        }
                                                        /* 0xdc00 - 0xdfff - low surrogates */
                                                        if (lo >= 0xdc00 && lo <= 0xdfff) {
                                                                /* combine the pair; the constant is
                                                                 * (0xd800 << 10) + 0xdc00 - 0x10000 */
                                                                cp = (hi << 10) + lo - 56613888; /* - offset */
                                                        } else {
                                                                /* handle graceful: raw invalid output bytes */
                                                                len += codepointtoutf8(hi, &str[len]);
                                                                if (capacity(&str, &sz, len, 4) == -1)
                                                                        goto end;
                                                                len += codepointtoutf8(lo, &str[len]);
                                                                continue;
                                                        }
                                                }
                                                len += codepointtoutf8(cp, &str[len]);
                                                continue;
                                        default:
                                                JSON_INVALID(); /* invalid escape char */
                                        }
                                        /* single character escapes: append the translated byte */
                                        if (capacity(&str, &sz, len, 1) == -1)
                                                goto end;
                                        str[len++] = c;
                                } else if (c == '\\') {
                                        escape = 1;
                                } else if (c == '"') {
                                        /* end of string: terminate it; from here on len
                                         * includes the NUL byte */
                                        if (capacity(&str, &sz, len, 1) == -1)
                                                goto end;
                                        str[len++] = '\0';

                                        if (iskey) {
                                                /* copy string as key, including NUL byte */
                                                if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), len, 1) == -1)
                                                        goto end;
                                                memcpy(nodes[depth].name, str, len);
                                        } else {
                                                cb(nodes, depth + 1, str, len - 1); /* length excluding NUL byte */
                                        }
                                        break;
                                } else {
                                        if (capacity(&str, &sz, len, 1) == -1)
                                                goto end;
                                        str[len++] = c;
                                }
                        }
                        if (iskey)
                                expect = EXPECT_OBJECT_KEY;
                        else
                                expect = EXPECT_END;
                        break;
                case '[':
                case '{':
                        if (depth + 1 >= JSON_MAX_NODE_DEPTH)
                                JSON_INVALID(); /* too deep */

                        nodes[depth].index = 0;
                        if (c == '[') {
                                nodes[depth].type = JSON_TYPE_ARRAY;
                                expect = EXPECT_ARRAY_VALUE;
                        } else if (c == '{') {
                                iskey = 1;
                                nodes[depth].type = JSON_TYPE_OBJECT;
                                expect = EXPECT_OBJECT_STRING;
                        }

                        /* report the array/object itself, with an empty value */
                        cb(nodes, depth + 1, "", 0);

                        /* descend: the child node starts with index 0 and an empty name */
                        depth++;
                        nodes[depth].index = 0;
                        if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), 0, 1) == -1)
                                goto end;
                        nodes[depth].name[0] = '\0';
                        break;
                case ']':
                case '}':
                        /* the closing character has to match the type of the
                         * array/object that is currently open */
                        if (!depth ||
                           (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARRAY) ||
                           (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJECT))
                                JSON_INVALID(); /* unbalanced nodes */

                        depth--;
                        nodes[depth].index++;
                        expect = EXPECT_END;
                        break;
                case ',':
                        if (!depth)
                                JSON_INVALID(); /* unbalanced nodes */

                        /* the running item index is kept in the enclosing
                         * array/object node */
                        nodes[depth - 1].index++;
                        if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
                                iskey = 1;
                                expect = EXPECT_STRING;
                        } else {
                                iskey = 0;
                                expect = EXPECT_VALUE;
                        }
                        break;
                case 't': /* true */
                        if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() != 'e')
                                JSON_INVALID();
                        nodes[depth].type = JSON_TYPE_BOOL;
                        cb(nodes, depth + 1, "true", 4);
                        expect = EXPECT_END;
                        break;
                case 'f': /* false */
                        if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() != 's' ||
                            GETNEXT() != 'e')
                                JSON_INVALID();
                        nodes[depth].type = JSON_TYPE_BOOL;
                        cb(nodes, depth + 1, "false", 5);
                        expect = EXPECT_END;
                        break;
                case 'n': /* null */
                        if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() != 'l')
                                JSON_INVALID();
                        nodes[depth].type = JSON_TYPE_NULL;
                        cb(nodes, depth + 1, "null", 4);
                        expect = EXPECT_END;
                        break;
                default: /* number */
                        /* the characters of the number are collected as-is in pri;
                         * the number format itself is not checked here */
                        nodes[depth].type = JSON_TYPE_NUMBER;
                        p = 0;
                        pri[p++] = c;
                        expect = EXPECT_END;
                        while (1) {
                                c = GETNEXT();
                                /* stop at EOF, at the first character that cannot be
                                 * part of a number, or when pri is full (one byte is
                                 * kept for the NUL terminator) */
                                if (c == EOF ||
                                    (!ISDIGIT(c) && c != 'e' && c != 'E' &&
                                     c != '+' && c != '-' && c != '.') ||
                                    p + 1 >= sizeof(pri)) {
                                        pri[p] = '\0';
                                        cb(nodes, depth + 1, pri, p);
                                        goto handlechr; /* do not read next char, handle this */
                                } else {
                                        pri[p++] = c;
                                }
                        }
                }
        }
        /* EOF reached while an array or object is still open */
        if (depth)
                JSON_INVALID(); /* unbalanced nodes */

        ret = 0; /* success */
end:
        /* release the name buffer of every node (unused ones are NULL) and
         * the string buffer */
        for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
                free(nodes[depth].name);
        free(str);

        return ret;
}