codemadness.org

       initial repo - extractjson - extract embedded JSON metadata from HTML pages
 (HTM) git clone git://git.codemadness.org/extractjson
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ad11115ba705c4c5f88f0679f2f807e4d0883970
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 14 Aug 2022 11:58:19 +0200
       
       initial repo
       
       Diffstat:
         A LICENSE                             |      15 +++++++++++++++
         A Makefile                            |       5 +++++
         A README                              |      15 +++++++++++++++
         A extractjson.1                       |      29 +++++++++++++++++++++++++++++
         A extractjson.c                       |     341 +++++++++++++++++++++++++++++++
       
       5 files changed, 405 insertions(+), 0 deletions(-)
       ---
 (DIR) diff --git a/LICENSE b/LICENSE
       @@ -0,0 +1,15 @@
       +ISC License
       +
       +Copyright (c) 2022 Hiltjo Posthuma <hiltjo@codemadness.org>
       +
       +Permission to use, copy, modify, and/or distribute this software for any
       +purpose with or without fee is hereby granted, provided that the above
       +copyright notice and this permission notice appear in all copies.
       +
       +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
       +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
       +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
       +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
       +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
       +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
       +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 (DIR) diff --git a/Makefile b/Makefile
       @@ -0,0 +1,5 @@
       +build: clean
       +        ${CC} -o extractjson extractjson.c ${CFLAGS} ${LDFLAGS}
       +
       +clean:
       +        rm -f *.o extractjson
 (DIR) diff --git a/README b/README
       @@ -0,0 +1,15 @@
       +extractjson
       +-----------
       +
       +Extracts embedded JSON metadata from HTML pages, such as data in the tags:
       +<script type="application/ld+json"> 
       +
       +It reads HTML from stdin and outputs JSON per line to stdout.
       +
       +Example:
       +
       +        curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
       +
       +This extracts the JSON metadata from the IMDB page of Ground Hog data.
       +It uses the first embedded JSON fragment and pipes it to json2tsv.
       +It can then be further processed using awk to get the relevant data.
 (DIR) diff --git a/extractjson.1 b/extractjson.1
       @@ -0,0 +1,29 @@
       +.Dd May 2, 2022
       +.Dt EXTRACTJSON 1
       +.Os
       +.Sh NAME
       +.Nm extractjson
       +.Nd extracts embedded JSON metadata from HTML pages
       +.Sh SYNOPSIS
       +.Nm
       +.Sh DESCRIPTION
       +.Nm
       +extracts embedded JSON metadata from HTML pages, such as data in the tags:
       +<script type="application/ld+json">
       +.Pp
       +It reads HTML from stdin and outputs JSON per line to stdout.
       +.Sh EXIT STATUS
       +.Ex -std
       +.Sh EXAMPLES
       +.Bd -literal
       +curl -s https://www.imdb.com/title/tt0107048/ | extractjson | sed 1q | json2tsv
       +.Ed
       +.Pp
       +This extracts the JSON metadata from the IMDB page of the movie "Ground Hog Day".
       +It uses the first embedded JSON fragment and pipes it to json2tsv.
       +It can then be further processed using awk to get the relevant data.
       +.Sh SEE ALSO
       +.Xr curl 1 ,
       +.Xr json2tsv 1
       +.Sh AUTHORS
       +.An Hiltjo Posthuma Aq Mt hiltjo@codemadness.org
 (DIR) diff --git a/extractjson.c b/extractjson.c
       @@ -0,0 +1,341 @@
       +#include <ctype.h>
       +#include <errno.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include <strings.h>
       +
       +#define GETNEXT getnext
       +
       +#define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
       +#define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
       +
       +typedef struct xmlparser {
       +        /* current tag */
       +        char tag[1024];
       +        size_t taglen;
       +        /* current tag is in shortform ? <tag /> */
       +        int isshorttag;
       +        /* current attribute name */
       +        char name[1024];
       +        /* data buffer used for tag data, cdata and attribute data */
       +        char data[BUFSIZ];
       +} XMLParser;
       +
       +static XMLParser parser;
       +static int isjson;
       +static const char *ignorestate, *endtag;
       +static int (*getnext)(void) = getchar;
       +
       +/* ignore parsing all HTML data inside <script> tags, because they may contain
       +   characters such as '<' and '>' */
       +static int
       +getnext_json(void)
       +{
       +        int c;
       +
       +        if ((c = getchar()) == EOF)
       +                return EOF;
       +
       +        if (tolower(c) == tolower((unsigned char)*ignorestate)) {
       +                ignorestate++;
       +                if (*ignorestate == '\0') {
       +                        getnext = getchar; /* restore */
       +                        putchar('\n');
       +                        isjson = 0;
       +                        return c;
       +                }
       +
       +        } else {
       +                ignorestate = endtag;
       +                if (c != '\r' && c != '\n')
       +                        putchar(c);
       +        }
       +
       +        return ' ';
       +}
       +
       +static void
       +xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
       +        const char *v, size_t vl)
       +{
       +        if (!strcasecmp(t, "script") &&
       +            !strcasecmp(a, "type")  &&
       +            (strstr(v, "application/json") ||
       +            strstr(v, "application/ld+json") ||
       +            strstr(v, "text/json")))
       +                isjson = 1;
       +}
       +
       +static void
       +xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
       +{
       +        if (!strcasecmp(t, "script") && isjson) {
       +                ignorestate = endtag = "</script>";
       +                getnext = getnext_json;
       +                return;
       +        }
       +}
       +
       +static void
       +xml_parseattrs(XMLParser *x)
       +{
       +        size_t namelen = 0, valuelen;
       +        int c, endsep, endname = 0, valuestart = 0;
       +
       +        while ((c = GETNEXT()) != EOF) {
       +                if (ISSPACE(c)) {
       +                        if (namelen)
       +                                endname = 1;
       +                        continue;
       +                } else if (c == '?')
       +                        ; /* ignore */
       +                else if (c == '=') {
       +                        x->name[namelen] = '\0';
       +                        valuestart = 1;
       +                        endname = 1;
       +                } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
       +                        /* attribute without value */
       +                        xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
       +                        x->name[namelen] = '\0';
       +                        endname = 0;
       +                        x->name[0] = c;
       +                        namelen = 1;
       +                } else if (namelen && valuestart) {
       +                        /* attribute with value */
       +                        valuelen = 0;
       +                        if (c == '\'' || c == '"') {
       +                                endsep = c;
       +                        } else {
       +                                endsep = ' '; /* ISSPACE() */
       +                                goto startvalue;
       +                        }
       +
       +                        while ((c = GETNEXT()) != EOF) {
       +startvalue:
       +                                if (c == '&') { /* entities */
       +                                        x->data[valuelen] = '\0';
       +                                        /* call data function with data before entity if there is data */
       +                                        if (valuelen)
       +                                                xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                        x->data[0] = c;
       +                                        valuelen = 1;
       +                                        while ((c = GETNEXT()) != EOF) {
       +                                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
       +                                                        break;
       +                                                if (valuelen < sizeof(x->data) - 1)
       +                                                        x->data[valuelen++] = c;
       +                                                else {
       +                                                        /* entity too long for buffer, handle as normal data */
       +                                                        x->data[valuelen] = '\0';
       +                                                        xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                                        x->data[0] = c;
       +                                                        valuelen = 1;
       +                                                        break;
       +                                                }
       +                                                if (c == ';') {
       +                                                        x->data[valuelen] = '\0';
       +                                                        valuelen = 0;
       +                                                        break;
       +                                                }
       +                                        }
       +                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
       +                                        if (valuelen < sizeof(x->data) - 1) {
       +                                                x->data[valuelen++] = c;
       +                                        } else {
       +                                                x->data[valuelen] = '\0';
       +                                                xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                                x->data[0] = c;
       +                                                valuelen = 1;
       +                                        }
       +                                }
       +                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
       +                                        x->data[valuelen] = '\0';
       +                                        xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                        break;
       +                                }
       +                        }
       +                        namelen = endname = valuestart = 0;
       +                } else if (namelen < sizeof(x->name) - 1) {
       +                        x->name[namelen++] = c;
       +                }
       +                if (c == '>') {
       +                        break;
       +                } else if (c == '/') {
       +                        x->isshorttag = 1;
       +                        x->name[0] = '\0';
       +                        namelen = 0;
       +                }
       +        }
       +}
       +
       +static void
       +xml_parsecomment(XMLParser *x)
       +{
       +        int c, i = 0;
       +
       +        while ((c = GETNEXT()) != EOF) {
       +                if (c == '-') {
       +                        if (++i > 2)
       +                                i = 2;
       +                        continue;
       +                } else if (c == '>' && i == 2) {
       +                        return;
       +                } else if (i) {
       +                        i = 0;
       +                }
       +        }
       +}
       +
       +static void
       +xml_parsecdata(XMLParser *x)
       +{
       +        size_t datalen = 0, i = 0;
       +        int c;
       +
       +        while ((c = GETNEXT()) != EOF) {
       +                if (c == ']') {
       +                        if (++i > 2)
       +                                i = 2;
       +                        continue;
       +                } else if (c == '>' && i == 2) {
       +                        return;
       +                } else if (i) {
       +                        i = 0;
       +                }
       +
       +                if (datalen < sizeof(x->data) - 1) {
       +                        x->data[datalen++] = c;
       +                } else {
       +                        x->data[datalen] = '\0';
       +                        x->data[0] = c;
       +                        datalen = 1;
       +                }
       +        }
       +}
       +
       +static void
       +xml_parse(XMLParser *x)
       +{
       +        size_t datalen, tagdatalen;
       +        int c, isend;
       +
       +        while ((c = GETNEXT()) != EOF && c != '<')
       +                ; /* skip until < */
       +
       +        while (c != EOF) {
       +                if (c == '<') { /* parse tag */
       +                        if ((c = GETNEXT()) == EOF)
       +                                return;
       +
       +                        if (c == '!') { /* cdata and comments */
       +                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
       +                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
       +                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
       +                                                x->data[tagdatalen++] = c;
       +                                        if (c == '>')
       +                                                break;
       +                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
       +                                                        (x->data[0] == '-')) {
       +                                                xml_parsecomment(x);
       +                                                break;
       +                                        } else if (c == '[') {
       +                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
       +                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
       +                                                        xml_parsecdata(x);
       +                                                        break;
       +                                                }
       +                                        }
       +                                }
       +                        } else {
       +                                /* normal tag (open, short open, close), processing instruction. */
       +                                x->tag[0] = c;
       +                                x->taglen = 1;
       +                                x->isshorttag = isend = 0;
       +
       +                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
       +                                if (c == '?') {
       +                                        x->isshorttag = 1;
       +                                } else if (c == '/') {
       +                                        if ((c = GETNEXT()) == EOF)
       +                                                return;
       +                                        x->tag[0] = c;
       +                                        isend = 1;
       +                                }
       +
       +                                while ((c = GETNEXT()) != EOF) {
       +                                        if (c == '/')
       +                                                x->isshorttag = 1; /* short tag */
       +                                        else if (c == '>' || ISSPACE(c)) {
       +                                                x->tag[x->taglen] = '\0';
       +                                                if (isend) { /* end tag, starts with </ */
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                } else {
       +                                                        /* start tag */
       +                                                        if (ISSPACE(c))
       +                                                                xml_parseattrs(x);
       +                                                        xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
       +                                                }
       +                                                /* call tagend for shortform or processing instruction */
       +                                                if (x->isshorttag) {
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                }
       +                                                break;
       +                                        } else if (x->taglen < sizeof(x->tag) - 1)
       +                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
       +                                }
       +                        }
       +                } else {
       +                        /* parse tag data */
       +                        datalen = 0;
       +                        while ((c = GETNEXT()) != EOF) {
       +                                if (c == '&') {
       +                                        if (datalen)
       +                                                x->data[datalen] = '\0';
       +                                        x->data[0] = c;
       +                                        datalen = 1;
       +                                        while ((c = GETNEXT()) != EOF) {
       +                                                if (c == '<')
       +                                                        break;
       +                                                if (datalen < sizeof(x->data) - 1)
       +                                                        x->data[datalen++] = c;
       +                                                else {
       +                                                        /* entity too long for buffer, handle as normal data */
       +                                                        x->data[datalen] = '\0';
       +                                                        x->data[0] = c;
       +                                                        datalen = 1;
       +                                                        break;
       +                                                }
       +                                                if (c == ';') {
       +                                                        x->data[datalen] = '\0';
       +                                                        datalen = 0;
       +                                                        break;
       +                                                }
       +                                        }
       +                                } else if (c != '<') {
       +                                        if (datalen < sizeof(x->data) - 1) {
       +                                                x->data[datalen++] = c;
       +                                        } else {
       +                                                x->data[datalen] = '\0';
       +                                                x->data[0] = c;
       +                                                datalen = 1;
       +                                        }
       +                                }
       +                                if (c == '<') {
       +                                        x->data[datalen] = '\0';
       +                                        break;
       +                                }
       +                        }
       +                }
       +        }
       +}
       +
       +int
       +main(void)
       +{
       +        xml_parse(&parser);
       +
       +        return 0;
       +}