extractjson.c - extractjson - extract embedded JSON metadata from HTML pages
 (HTM) git clone git://git.codemadness.org/extractjson
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       extractjson.c (7744B)
       ---
            1 #include <ctype.h>
            2 #include <errno.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 #include <strings.h>
            7 
            8 #define GETNEXT getnext
            9 
           10 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
           11 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           12 
           13 typedef struct xmlparser {
           14         /* current tag */
           15         char tag[1024];
           16         size_t taglen;
           17         /* current tag is in shortform ? <tag /> */
           18         int isshorttag;
           19         /* current attribute name */
           20         char name[1024];
           21         /* data buffer used for tag data, cdata and attribute data */
           22         char data[BUFSIZ];
           23 } XMLParser;
           24 
           25 static XMLParser parser;
           26 static int isjson;
           27 static const char *ignorestate, *endtag;
           28 static int (*getnext)(void) = getchar;
           29 
           30 /* ignore parsing all HTML data inside <script> tags, because they may contain
           31    characters such as '<' and '>' */
           32 static int
           33 getnext_json(void)
           34 {
           35         int c;
           36 
           37         if ((c = getchar()) == EOF)
           38                 return EOF;
           39 
           40         if (tolower(c) == tolower((unsigned char)*ignorestate)) {
           41                 ignorestate++;
           42                 if (*ignorestate == '\0') {
           43                         getnext = getchar; /* restore */
           44                         putchar('\n');
           45                         isjson = 0;
           46                         return c;
           47                 }
           48 
           49         } else {
           50                 ignorestate = endtag;
           51                 if (c != '\r' && c != '\n')
           52                         putchar(c);
           53         }
           54 
           55         return ' ';
           56 }
           57 
           58 static void
           59 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
           60         const char *v, size_t vl)
           61 {
           62         if (!strcasecmp(t, "script") &&
           63             !strcasecmp(a, "type")  &&
           64             (strstr(v, "application/json") ||
           65             strstr(v, "application/ld+json") ||
           66             strstr(v, "text/json")))
           67                 isjson = 1;
           68 }
           69 
           70 static void
           71 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
           72 {
           73         if (!strcasecmp(t, "script") && isjson) {
           74                 ignorestate = endtag = "</script>";
           75                 getnext = getnext_json;
           76                 return;
           77         }
           78 }
           79 
           80 static void
           81 xml_parseattrs(XMLParser *x)
           82 {
           83         size_t namelen = 0, valuelen;
           84         int c, endsep, endname = 0, valuestart = 0;
           85 
           86         while ((c = GETNEXT()) != EOF) {
           87                 if (ISSPACE(c)) {
           88                         if (namelen)
           89                                 endname = 1;
           90                         continue;
           91                 } else if (c == '?')
           92                         ; /* ignore */
           93                 else if (c == '=') {
           94                         x->name[namelen] = '\0';
           95                         valuestart = 1;
           96                         endname = 1;
           97                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           98                         /* attribute without value */
           99                         xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
          100                         x->name[namelen] = '\0';
          101                         endname = 0;
          102                         x->name[0] = c;
          103                         namelen = 1;
          104                 } else if (namelen && valuestart) {
          105                         /* attribute with value */
          106                         valuelen = 0;
          107                         if (c == '\'' || c == '"') {
          108                                 endsep = c;
          109                         } else {
          110                                 endsep = ' '; /* ISSPACE() */
          111                                 goto startvalue;
          112                         }
          113 
          114                         while ((c = GETNEXT()) != EOF) {
          115 startvalue:
          116                                 if (c == '&') { /* entities */
          117                                         x->data[valuelen] = '\0';
          118                                         /* call data function with data before entity if there is data */
          119                                         if (valuelen)
          120                                                 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          121                                         x->data[0] = c;
          122                                         valuelen = 1;
          123                                         while ((c = GETNEXT()) != EOF) {
          124                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
          125                                                         break;
          126                                                 if (valuelen < sizeof(x->data) - 1)
          127                                                         x->data[valuelen++] = c;
          128                                                 else {
          129                                                         /* entity too long for buffer, handle as normal data */
          130                                                         x->data[valuelen] = '\0';
          131                                                         xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          132                                                         x->data[0] = c;
          133                                                         valuelen = 1;
          134                                                         break;
          135                                                 }
          136                                                 if (c == ';') {
          137                                                         x->data[valuelen] = '\0';
          138                                                         valuelen = 0;
          139                                                         break;
          140                                                 }
          141                                         }
          142                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          143                                         if (valuelen < sizeof(x->data) - 1) {
          144                                                 x->data[valuelen++] = c;
          145                                         } else {
          146                                                 x->data[valuelen] = '\0';
          147                                                 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          148                                                 x->data[0] = c;
          149                                                 valuelen = 1;
          150                                         }
          151                                 }
          152                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          153                                         x->data[valuelen] = '\0';
          154                                         xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          155                                         break;
          156                                 }
          157                         }
          158                         namelen = endname = valuestart = 0;
          159                 } else if (namelen < sizeof(x->name) - 1) {
          160                         x->name[namelen++] = c;
          161                 }
          162                 if (c == '>') {
          163                         break;
          164                 } else if (c == '/') {
          165                         x->isshorttag = 1;
          166                         x->name[0] = '\0';
          167                         namelen = 0;
          168                 }
          169         }
          170 }
          171 
          172 static void
          173 xml_parsecomment(XMLParser *x)
          174 {
          175         int c, i = 0;
          176 
          177         while ((c = GETNEXT()) != EOF) {
          178                 if (c == '-') {
          179                         if (++i > 2)
          180                                 i = 2;
          181                         continue;
          182                 } else if (c == '>' && i == 2) {
          183                         return;
          184                 } else if (i) {
          185                         i = 0;
          186                 }
          187         }
          188 }
          189 
          190 static void
          191 xml_parsecdata(XMLParser *x)
          192 {
          193         size_t datalen = 0, i = 0;
          194         int c;
          195 
          196         while ((c = GETNEXT()) != EOF) {
          197                 if (c == ']') {
          198                         if (++i > 2)
          199                                 i = 2;
          200                         continue;
          201                 } else if (c == '>' && i == 2) {
          202                         return;
          203                 } else if (i) {
          204                         i = 0;
          205                 }
          206 
          207                 if (datalen < sizeof(x->data) - 1) {
          208                         x->data[datalen++] = c;
          209                 } else {
          210                         x->data[datalen] = '\0';
          211                         x->data[0] = c;
          212                         datalen = 1;
          213                 }
          214         }
          215 }
          216 
/* Main parser loop: read the input through GETNEXT until EOF and split it into
   tags, comments, CDATA sections and tag data.
   For a start tag the attributes are parsed with xml_parseattrs() (which
   reports them to xmlattr()) and then xmltagstartparsed() is called; that
   callback may switch the reader used by GETNEXT. Tag data, comments and CDATA
   sections are consumed, but their contents are not passed on anywhere. */
static void
xml_parse(XMLParser *x)
{
        size_t datalen, tagdatalen;
        int c, isend;

        while ((c = GETNEXT()) != EOF && c != '<')
                ; /* skip until < */

        /* c is either '<' (start of a tag) or the last character of the
           previous construct, in which case tag data follows */
        while (c != EOF) {
                if (c == '<') { /* parse tag */
                        if ((c = GETNEXT()) == EOF)
                                return;

                        if (c == '!') { /* cdata and comments */
                                /* only the first few characters after "<!" are stored,
                                   enough to recognize "--" and "[CDATA[" */
                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                x->data[tagdatalen++] = c;
                                        if (c == '>')
                                                break;
                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
                                                        (x->data[0] == '-')) {
                                                /* "<!--": skip the comment */
                                                xml_parsecomment(x);
                                                break;
                                        } else if (c == '[') {
                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                        /* "<![CDATA[": consume the section */
                                                        xml_parsecdata(x);
                                                        break;
                                                }
                                        }
                                }
                        } else {
                                /* normal tag (open, short open, close), processing instruction. */
                                x->tag[0] = c;
                                x->taglen = 1;
                                x->isshorttag = isend = 0;

                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
                                if (c == '?') {
                                        x->isshorttag = 1;
                                } else if (c == '/') {
                                        /* end tag: the name starts after the '/' */
                                        if ((c = GETNEXT()) == EOF)
                                                return;
                                        x->tag[0] = c;
                                        isend = 1;
                                }

                                /* read the rest of the tag name */
                                while ((c = GETNEXT()) != EOF) {
                                        if (c == '/')
                                                x->isshorttag = 1; /* short tag */
                                        else if (c == '>' || ISSPACE(c)) {
                                                /* end of the tag name */
                                                x->tag[x->taglen] = '\0';
                                                if (isend) { /* end tag, starts with </ */
                                                        while (c != '>' && c != EOF) /* skip until > */
                                                                c = GETNEXT();
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                } else {
                                                        /* start tag */
                                                        /* attributes can only follow after whitespace */
                                                        if (ISSPACE(c))
                                                                xml_parseattrs(x);
                                                        xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                }
                                                /* call tagend for shortform or processing instruction */
                                                /* (there is no tagend callback here: the tag name is only cleared) */
                                                if (x->isshorttag) {
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                }
                                                break;
                                        } else if (x->taglen < sizeof(x->tag) - 1)
                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                }
                        }
                } else {
                        /* parse tag data */
                        /* the data is collected in x->data up to the next '<',
                           the buffer restarts from the beginning when it is full */
                        datalen = 0;
                        while ((c = GETNEXT()) != EOF) {
                                if (c == '&') {
                                        /* entity: restart the buffer with the '&' */
                                        if (datalen)
                                                x->data[datalen] = '\0';
                                        x->data[0] = c;
                                        datalen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                if (c == '<')
                                                        break;
                                                if (datalen < sizeof(x->data) - 1)
                                                        x->data[datalen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[datalen] = '\0';
                                                        x->data[0] = c;
                                                        datalen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        /* end of the entity */
                                                        x->data[datalen] = '\0';
                                                        datalen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != '<') {
                                        if (datalen < sizeof(x->data) - 1) {
                                                x->data[datalen++] = c;
                                        } else {
                                                x->data[datalen] = '\0';
                                                x->data[0] = c;
                                                datalen = 1;
                                        }
                                }
                                if (c == '<') {
                                        /* start of the next tag: handled by the outer loop */
                                        x->data[datalen] = '\0';
                                        break;
                                }
                        }
                }
        }
}
          336 
          337 int
          338 main(void)
          339 {
          340         xml_parse(&parser);
          341 
          342         return 0;
          343 }