refactor urls into general replacement function and replace some HTML entities - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ff8d2ecaed4cb56e6cc1ccdc4a43e1a3e45eb61f
 (DIR) parent 62a3853f6428208e5be727175479ebcede127497
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun,  7 Jun 2020 00:08:42 +0200
       
       refactor urls into general replacement function and replace some HTML entities
       
       Diffstat:
         M tscrape.c                           |      64 ++++++++++++++++---------------
       
       1 file changed, 34 insertions(+), 30 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -30,17 +30,16 @@ struct tweet {
                struct tweet *next;
        };
        
       -/* url entities and their replacements */
       -struct url {
       -        char url[256];
       -        size_t url_len;
       -        char expanded_url[1024];
       +struct replacement {
       +        char search[256];
       +        size_t search_len;
       +        char replace[1024];
        
       -        struct url *next;
       +        struct replacement *next;
        };
        
        static struct tweet *tweets, *tc;
       -static struct url *urls, *uc;
       +static struct replacement *reps, *rc;
        static char expanded_url[1024], media_url[1024], url[256];
        
        #define MAX_PINNED 5
       @@ -156,7 +155,7 @@ printescape(const char *s)
        static void
        printexpand(const char *s)
        {
       -        struct url *u;
       +        struct replacement *r;
        
                for (; *s; s++) {
                        if (isspace((unsigned char)*s)) {
       @@ -165,14 +164,14 @@ printexpand(const char *s)
                        } else if (iscntrl((unsigned char)*s)) {
                                continue;
                        }
       -                for (u = urls; u; u = u->next) {
       -                        if (!strncmp(s, u->url, u->url_len)) {
       -                                s += u->url_len - 1;
       -                                printescape(u->expanded_url);
       +                for (r = reps; r; r = r->next) {
       +                        if (!strncmp(s, r->search, r->search_len)) {
       +                                s += r->search_len - 1;
       +                                printescape(r->replace);
                                        break;
                                }
                        }
       -                if (!u)
       +                if (!r)
                                putchar(*s);
                }
        }
       @@ -231,25 +230,25 @@ addtweet(void)
        }
        
        void
       -addurl(const char *url, const char *expanded_url)
       +addreplacement(const char *search, const char *replace)
        {
       -        struct url *u;
       +        struct replacement *r;
        
       -        for (u = urls; u; u = u->next) {
       -                if (!strncmp(url, u->url, u->url_len))
       +        for (r = reps; r; r = r->next) {
       +                if (!strncmp(search, r->search, r->search_len))
                                return;
                }
        
       -        if (!(u = calloc(1, sizeof(*u))))
       +        if (!(r = calloc(1, sizeof(*r))))
                        err(1, "calloc");
       -        strlcpy(u->url, url, sizeof(u->url));
       -        u->url_len = strlen(u->url);
       -        strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
       +        strlcpy(r->search, search, sizeof(r->search));
       +        r->search_len = strlen(r->search);
       +        strlcpy(r->replace, replace, sizeof(r->replace));
        
       -        if (urls)
       -                uc = uc->next = u;
       +        if (reps)
       +                rc = rc->next = r;
                else
       -                urls = uc = u;
       +                reps = rc = r;
        }
        
        void
       @@ -353,7 +352,7 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
                        } else if (!strcmp(nodes[5].name, "expanded_url")) {
        //                        printf("DEBUG: expanded_url: %s\n", str);
                                /* assumes "expanded_url" is specified after "url" */
       -                        addurl(url, str);
       +                        addreplacement(url, str);
                                url[0] = '\0';
                        }
                }
       @@ -380,9 +379,9 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
                        } else if (!strcmp(nodes[5].name, "type")) {
        //                        printf("DEBUG: type: %s\n", str);
                                if (!strcmp(str, "photo")) {
       -                                addurl(url, media_url);
       +                                addreplacement(url, media_url);
                                } else {
       -                                addurl(url, expanded_url);
       +                                addreplacement(url, expanded_url);
                                }
                                media_url[0] = url[0] = expanded_url[0] = '\0';
                        }
       @@ -404,7 +403,7 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
                                strlcpy(url, str, sizeof(url));
                        } else if (!strcmp(nodes[6].name, "expanded_url")) {
        //                        printf("DEBUG: expanded_url: %s\n", str);
       -                        addurl(url, str);
       +                        addreplacement(url, str);
                                url[0] = '\0';
                        }
                }
       @@ -433,9 +432,9 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
                        } else if (!strcmp(nodes[6].name, "type")) {
        //                        printf("DEBUG: type: %s\n", str);
                                if (!strcmp(str, "photo")) {
       -                                addurl(url, media_url);
       +                                addreplacement(url, media_url);
                                } else {
       -                                addurl(url, expanded_url);
       +                                addreplacement(url, expanded_url);
                                }
                                media_url[0] = url[0] = expanded_url[0] = '\0';
                        }
       @@ -454,6 +453,11 @@ main(void)
                if (parsejson(processnodes))
                        errx(2, "invalid JSON");
        
       +        /* replace some HTML entities */
       +        addreplacement("&lt;", "<");
       +        addreplacement("&gt;", ">");
       +        addreplacement("&amp;", "&");
       +
                for (t = tweets; t; t = t->next) {
                        /* check for pinned tweets */
                        for (i = 0; i < npinned; i++) {