json.c - tscrape - twitter scraper (not working anymore)
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       json.c (7806B)
       ---
            1 #include <ctype.h>
            2 #include <errno.h>
            3 #include <stdint.h>
            4 #include <stdio.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 
            8 #define GETNEXT getchar
            9 
           10 #include "json.h"
           11 
           /*
            * Encode code point r as UTF-8 into s, which must have room for up to
            * 4 bytes. No terminating NUL is written. Returns the number of bytes
            * stored; code point 0 stores nothing and returns 0.
            */
           static int
           codepointtoutf8(long r, char *s)
           {
                   if (r == 0)
                           return 0; /* NUL byte is dropped */

                   if (r <= 0x7F) {
                           /* one byte: 0aaaaaaa */
                           s[0] = r;
                           return 1;
                   }

                   if (r <= 0x07FF) {
                           /* two bytes: 110aaaaa 10bbbbbb */
                           s[0] = 0xC0 | ((r >> 6) & 0x1F);
                           s[1] = 0x80 | (r & 0x3F);
                           return 2;
                   }

                   if (r <= 0xFFFF) {
                           /* three bytes: 1110aaaa 10bbbbbb 10cccccc */
                           s[0] = 0xE0 | ((r >> 12) & 0x0F);
                           s[1] = 0x80 | ((r >> 6) & 0x3F);
                           s[2] = 0x80 | (r & 0x3F);
                           return 3;
                   }

                   /* four bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
                   s[0] = 0xF0 | ((r >> 18) & 0x07);
                   s[1] = 0x80 | ((r >> 12) & 0x3F);
                   s[2] = 0x80 | ((r >> 6) & 0x3F);
                   s[3] = 0x80 | (r & 0x3F);
                   return 4;
           }
           41 
           /*
            * Convert one hexadecimal digit character (0-9, a-f, A-F) to its numeric
            * value. Any other character yields 0.
            */
           static int
           hexdigit(int c)
           {
                   int value = 0;

                   if ('0' <= c && c <= '9')
                           value = c - '0';
                   else if ('a' <= c && c <= 'f')
                           value = c - 'a' + 10;
                   else if ('A' <= c && c <= 'F')
                           value = c - 'A' + 10;

                   return value;
           }
           53 
           /*
            * Make sure the buffer *value (currently *sz bytes) can hold cur + inc
            * bytes, growing it with realloc() when it cannot. The new size is at
            * least 64 and is doubled until it exceeds the requirement.
            *
            * Returns 0 on success. Returns -1 when cur + inc overflows (errno is set
            * to EOVERFLOW) or when realloc() fails; in both cases *value and *sz are
            * left untouched and it is up to the caller to free *value.
            */
           static int
           capacity(char **value, size_t *sz, size_t cur, size_t inc)
           {
                   size_t required, grown;
                   char *resized;

                   /* cur + inc must not wrap around */
                   if (inc > SIZE_MAX - cur) {
                           errno = EOVERFLOW;
                           return -1;
                   }
                   required = cur + inc;

                   /* already large enough */
                   if (required <= *sz)
                           return 0;

                   if (required > SIZE_MAX / 2) {
                           /* doubling could overflow: ask for the maximum */
                           grown = SIZE_MAX;
                   } else {
                           grown = (*sz < 64) ? 64 : *sz;
                           while (grown <= required)
                                   grown *= 2;
                   }

                   resized = realloc(*value, grown);
                   if (resized == NULL)
                           return -1;

                   *value = resized;
                   *sz = grown;
                   return 0;
           }
           81 
           /*
            * Character sets passed to strchr() by parsejson(): each one lists the
            * non-white-space characters that are allowed next in the current state.
            */
           #define EXPECT_VALUE         "{[\"-0123456789tfn"
           #define EXPECT_STRING        "\""
           #define EXPECT_END           "}],"
           #define EXPECT_OBJECT_STRING EXPECT_STRING "}"
           #define EXPECT_OBJECT_KEY    ":"
           #define EXPECT_ARRAY_VALUE   EXPECT_VALUE "]"

           /*
            * Mark the input as invalid and jump to the cleanup label. Requires a
            * local variable `ret` and a label `end` in the function using it.
            * The definition deliberately has no trailing semicolon, so that
            * `JSON_INVALID();` is exactly one statement and is safe in if/else.
            */
           #define JSON_INVALID()       do { ret = JSON_ERROR_INVALID; goto end; } while (0)
           90 
           91 int
           92 parsejson(void (*cb)(struct json_node *, size_t, const char *))
           93 {
           94         struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
           95         size_t depth = 0, p = 0, len, sz = 0;
           96         long cp, hi, lo;
           97         char pri[128], *str = NULL;
           98         int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
           99         const char *expect = EXPECT_VALUE;
          100 
          101         if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
          102                 goto end;
          103         nodes[0].name[0] = '\0';
          104 
          105         while (1) {
          106                 c = GETNEXT();
          107 handlechr:
          108                 if (c == EOF)
          109                         break;
          110 
          111                 /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
          112                 if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
          113                         continue;
          114 
          115                 if (!c || !strchr(expect, c))
          116                         JSON_INVALID();
          117 
          118                 switch (c) {
          119                 case ':':
          120                         iskey = 0;
          121                         expect = EXPECT_VALUE;
          122                         break;
          123                 case '"':
          124                         nodes[depth].type = JSON_TYPE_STRING;
          125                         escape = 0;
          126                         len = 0;
          127                         while (1) {
          128                                 c = GETNEXT();
          129 chr:
          130                                 /* EOF or control char: 0x7f is not defined as a control char in RFC8259 */
          131                                 if (c < 0x20)
          132                                         JSON_INVALID();
          133 
          134                                 if (escape) {
          135 escchr:
          136                                         escape = 0;
          137                                         switch (c) {
          138                                         case '"': /* FALLTHROUGH */
          139                                         case '\\':
          140                                         case '/': break;
          141                                         case 'b': c = '\b'; break;
          142                                         case 'f': c = '\f'; break;
          143                                         case 'n': c = '\n'; break;
          144                                         case 'r': c = '\r'; break;
          145                                         case 't': c = '\t'; break;
          146                                         case 'u': /* hex hex hex hex */
          147                                                 if (capacity(&str, &sz, len, 4) == -1)
          148                                                         goto end;
          149                                                 for (i = 12, cp = 0; i >= 0; i -= 4) {
          150                                                         if ((c = GETNEXT()) == EOF || !isxdigit(c))
          151                                                                 JSON_INVALID(); /* invalid code point */
          152                                                         cp |= (hexdigit(c) << i);
          153                                                 }
          154                                                 /* RFC8259 - 7. Strings - surrogates.
          155                                                  * 0xd800 - 0xdb7f - high surrogates */
          156                                                 if (cp >= 0xd800 && cp <= 0xdb7f) {
          157                                                         if ((c = GETNEXT()) != '\\') {
          158                                                                 len += codepointtoutf8(cp, &str[len]);
          159                                                                 goto chr;
          160                                                         }
          161                                                         if ((c = GETNEXT()) != 'u') {
          162                                                                 len += codepointtoutf8(cp, &str[len]);
          163                                                                 goto escchr;
          164                                                         }
          165                                                         for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
          166                                                                 if ((c = GETNEXT()) == EOF || !isxdigit(c))
          167                                                                         JSON_INVALID(); /* invalid code point */
          168                                                                 lo |= (hexdigit(c) << i);
          169                                                         }
          170                                                         /* 0xdc00 - 0xdfff - low surrogates */
          171                                                         if (lo >= 0xdc00 && lo <= 0xdfff) {
          172                                                                 cp = (hi << 10) + lo - 56613888; /* - offset */
          173                                                         } else {
          174                                                                 /* handle graceful: raw invalid output bytes */
          175                                                                 len += codepointtoutf8(hi, &str[len]);
          176                                                                 if (capacity(&str, &sz, len, 4) == -1)
          177                                                                         goto end;
          178                                                                 len += codepointtoutf8(lo, &str[len]);
          179                                                                 continue;
          180                                                         }
          181                                                 }
          182                                                 len += codepointtoutf8(cp, &str[len]);
          183                                                 continue;
          184                                         default:
          185                                                 JSON_INVALID(); /* invalid escape char */
          186                                         }
          187                                         if (capacity(&str, &sz, len, 1) == -1)
          188                                                 goto end;
          189                                         str[len++] = c;
          190                                 } else if (c == '\\') {
          191                                         escape = 1;
          192                                 } else if (c == '"') {
          193                                         if (capacity(&str, &sz, len, 1) == -1)
          194                                                 goto end;
          195                                         str[len++] = '\0';
          196 
          197                                         if (iskey) {
          198                                                 /* copy string as key, including NUL byte */
          199                                                 if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), len, 1) == -1)
          200                                                         goto end;
          201                                                 memcpy(nodes[depth].name, str, len);
          202                                         } else {
          203                                                 cb(nodes, depth + 1, str);
          204                                         }
          205                                         break;
          206                                 } else {
          207                                         if (capacity(&str, &sz, len, 1) == -1)
          208                                                 goto end;
          209                                         str[len++] = c;
          210                                 }
          211                         }
          212                         if (iskey)
          213                                 expect = EXPECT_OBJECT_KEY;
          214                         else
          215                                 expect = EXPECT_END;
          216                         break;
          217                 case '[':
          218                 case '{':
          219                         if (depth + 1 >= JSON_MAX_NODE_DEPTH)
          220                                 JSON_INVALID(); /* too deep */
          221 
          222                         nodes[depth].index = 0;
          223                         if (c == '[') {
          224                                 nodes[depth].type = JSON_TYPE_ARRAY;
          225                                 expect = EXPECT_ARRAY_VALUE;
          226                         } else if (c == '{') {
          227                                 iskey = 1;
          228                                 nodes[depth].type = JSON_TYPE_OBJECT;
          229                                 expect = EXPECT_OBJECT_STRING;
          230                         }
          231 
          232                         cb(nodes, depth + 1, "");
          233 
          234                         depth++;
          235                         nodes[depth].index = 0;
          236                         if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), 0, 1) == -1)
          237                                 goto end;
          238                         nodes[depth].name[0] = '\0';
          239                         break;
          240                 case ']':
          241                 case '}':
          242                         if (!depth ||
          243                            (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARRAY) ||
          244                            (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJECT))
          245                                 JSON_INVALID(); /* unbalanced nodes */
          246 
          247                         nodes[--depth].index++;
          248                         expect = EXPECT_END;
          249                         break;
          250                 case ',':
          251                         if (!depth)
          252                                 JSON_INVALID(); /* unbalanced nodes */
          253 
          254                         nodes[depth - 1].index++;
          255                         if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
          256                                 iskey = 1;
          257                                 expect = EXPECT_STRING;
          258                         } else {
          259                                 expect = EXPECT_VALUE;
          260                         }
          261                         break;
          262                 case 't': /* true */
          263                         if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() != 'e')
          264                                 JSON_INVALID();
          265                         nodes[depth].type = JSON_TYPE_BOOL;
          266                         cb(nodes, depth + 1, "true");
          267                         expect = EXPECT_END;
          268                         break;
          269                 case 'f': /* false */
          270                         if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() != 's' ||
          271                             GETNEXT() != 'e')
          272                                 JSON_INVALID();
          273                         nodes[depth].type = JSON_TYPE_BOOL;
          274                         cb(nodes, depth + 1, "false");
          275                         expect = EXPECT_END;
          276                         break;
          277                 case 'n': /* null */
          278                         if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() != 'l')
          279                                 JSON_INVALID();
          280                         nodes[depth].type = JSON_TYPE_NULL;
          281                         cb(nodes, depth + 1, "null");
          282                         expect = EXPECT_END;
          283                         break;
          284                 default: /* number */
          285                         nodes[depth].type = JSON_TYPE_NUMBER;
          286                         p = 0;
          287                         pri[p++] = c;
          288                         expect = EXPECT_END;
          289                         while (1) {
          290                                 c = GETNEXT();
          291                                 if (c == EOF ||
          292                                     !c || !strchr("0123456789eE+-.", c) ||
          293                                     p + 1 >= sizeof(pri)) {
          294                                         pri[p] = '\0';
          295                                         cb(nodes, depth + 1, pri);
          296                                         goto handlechr; /* do not read next char, handle this */
          297                                 } else {
          298                                         pri[p++] = c;
          299                                 }
          300                         }
          301                 }
          302         }
          303         if (depth)
          304                 JSON_INVALID(); /* unbalanced nodes */
          305 
          306         ret = 0; /* success */
          307 end:
          308         for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
          309                 free(nodes[depth].name);
          310         free(str);
          311 
          312         return ret;
          313 }