improvements - surf-adblock - Surf adblock web extension
 (HTM) git clone git://git.codemadness.org/surf-adblock
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 3cc61dad61ee13b47cc3b6a2931de9413c4c6176
 (DIR) parent 84d3f064e393f5856f4bbbfb519b267ed4a5aa0a
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Mon,  5 Jun 2017 17:36:10 +0200
       
       improvements
       
       - fix CARET handling in match()
       - fix matchbegin rules (rules starting with ||).
       
       Diffstat:
         M TODO                                |       2 ++
         M adblock.c                           |     233 ++++++++++++++++++++-----------
         M surf-adblock.c                      |       6 +++---
         M tests/tests.c                       |      36 ++++++++++++++-----------------
       
       4 files changed, 175 insertions(+), 102 deletions(-)
       ---
 (DIR) diff --git a/TODO b/TODO
       @@ -1,3 +1,5 @@
       +- simplify match, there are only a few rules with multiple *.
       +
        - loadrules: return struct rules* ?
          on error free rules.
        
 (DIR) diff --git a/adblock.c b/adblock.c
       @@ -35,8 +35,8 @@ struct filterrule {
                int matchend;
                /* is exception rule: prefix @@ for ABP or #@# for CSS */
                int isexception;
       -        char *css; /* if non-NULL is CSS rule / hide element rule */
       -        char *uri;
       +        const char *css; /* if non-NULL is CSS rule / hide element rule */
       +        const char *uri;
                struct filterdomain *domains;
                struct filterrule *next;
        };
       @@ -184,14 +184,14 @@ string_append(String *s, const char *data, size_t len)
                memcpy(s->data + s->len, data, len);
                s->len = newlen;
                s->data[s->len] = '\0';
       +
                return len;
        }
        
        #define END          0
        #define UNMATCHABLE -2
       -#define BRACKET     -3
       -#define CARET       -4
       -#define STAR        -5
       +#define CARET       -3
       +#define STAR        -4
        
        static int
        str_next(const char *str, size_t n, size_t *step)
       @@ -275,11 +275,19 @@ match(const char *pat, const char *str, int fcase)
                                pat++;
                                m--;
                                break;
       -                default:
       +                case CARET:
                                k = str_next(str, n, &sinc);
       -                        /* TODO: write a test-case */
       -                        if (c == CARET && (k == '?' || k == '/' || k <= 0))
       +                        if (k <= 0)
       +                                return (c==END) ? 0 : 1;
       +                        str += sinc;
       +                        n -= sinc;
       +                        if (k != '?' && k != '/')
                                        return 1;
       +                        pat++;
       +                        m--;
       +                        break;
       +                default:
       +                        k = str_next(str, n, &sinc);
                                if (k <= 0)
                                        return (c==END) ? 0 : 1;
                                str += sinc;
       @@ -341,9 +349,14 @@ match(const char *pat, const char *str, int fcase)
                                break;
                        }
                        s += sinc;
       -                kfold = fcase ? casefold(k) : k;
       -                if (k != c && kfold != c)
       -                        return 1;
       +                if (c == CARET) {
       +                        if  (k != '/' && k != '?')
       +                                return 1;
       +                } else {
       +                        kfold = fcase ? casefold(k) : k;
       +                        if (k != c && kfold != c)
       +                                return 1;
       +                }
                }
        
                /* We're all done with the tails now, so throw them out */
       @@ -366,10 +379,16 @@ match(const char *pat, const char *str, int fcase)
                                k = str_next(s, endstr-s, &sinc);
                                if (!k)
                                        return 1;
       -                        kfold = fcase ? casefold(k) : k;
       -                        if (k != c && kfold != c)
       -                                break;
                                s += sinc;
       +                        if (c == CARET) {
       +                                if (k != '/' && k != '?')
       +                                        break;
       +                        } else {
       +                                kfold = fcase ? casefold(k) : k;
       +                                if (k != c && kfold != c)
       +                                        break;
       +                        }
       +
                        }
                        if (c == STAR) continue;
                        /* If we failed, advance str, by 1 char if it's a valid
       @@ -486,17 +505,20 @@ matchdomain(const char *s, const char *domain)
        }
        
        static int
       -matchrule(struct filterrule *f, const char *uri, const char *type,
       -          const char *domain)
       +matchrule(struct filterrule *f, const char *fromuri, const char *fromdomain,
       +        const char *fromrel,
       +        const char *requri, const char *reqdomain, const char *reqrel,
       +        const char *type)
        {
                /* NOTE: order matters, see FilterType enum values */
                struct filterdomain *d;
                char pat[1024];
       -        int r, m;
       +        const char *uri;
       +        int len, r;
        
                r = f->domains ? 0 : 1;
                for (d = f->domains; d; d = d->next) {
       -                if (matchdomain(d->domain, domain)) {
       +                if (matchdomain(d->domain, fromdomain)) {
                                if (r && d->inverse)
                                        r = 0;
                                else if (!r && !d->inverse)
       @@ -521,39 +543,58 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
                        return 0;
        #endif
        
       -        r = snprintf(pat, sizeof(pat), "%s%s%s",
       -                f->matchbegin ? "" : "*",
       -                f->uri,
       -                f->matchend ? "" : "*");
       -        if (r == -1 || (size_t)r >= sizeof(pat)) {
       -                fprintf(stderr, "warning: pattern too large, ignoring\n");
       -                return 0;
       -        }
       -
       -        /* DEBUG */
       +        /* match begin including domain */
                if (f->matchbegin) {
       -                printf("pat: %s,    uri: %s,    domain: %s\n", pat, uri, domain);
       -        }
       +                /* TODO: match domain part of pattern */
       +                /* TODO: preprocess pattern if it is matchbegin? */
        
       -        m = 0;
       -        if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
       -#if 0
       -                for (; *type; type++) {
       -                        for (i = 0; blockstr[i]; i++) {
       -                                if (blockstr[i] == *type &&
       -                                    f->block & (1 << i))
       -                                        printf("block type '%c'\n", blockstr[i]);
       -                                        return 1;
       -                                }
       +                len = strcspn(f->uri, "^/");
       +
       +                /* match domain without dot */
       +                r = snprintf(pat, sizeof(pat), "%.*s",
       +                        len, f->uri);
       +                if (r == -1 || (size_t)r >= sizeof(pat)) {
       +                        fprintf(stderr, "warning: pattern too large, ignoring\n");
       +                        return 0;
       +                }
       +
       +                /* TODO: block type mask */
       +                if (match(pat, reqdomain, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
       +                        /* match domain with dot */
       +                        r = snprintf(pat, sizeof(pat), "*.%.*s",
       +                                len, f->uri);
       +                        if (r == -1 || (size_t)r >= sizeof(pat)) {
       +                                fprintf(stderr, "warning: pattern too large, ignoring\n");
       +                                return 0;
                                }
       +
       +                        /* TODO: block type mask */
       +                        if (match(pat, reqdomain, (f->block & FilterTypeMatchCase) ? 0 : 1))
       +                                return 0;
                        }
        
       +                /* match on path */
       +                r = snprintf(pat, sizeof(pat), "*%s%s",
       +                        f->uri + len,
       +                        f->matchend ? "" : "*");
       +                uri = reqrel;
       +        } else {
       +                r = snprintf(pat, sizeof(pat), "*%s%s",
       +                        f->uri,
       +                        f->matchend ? "" : "*");
       +                uri = requri;
       +
       +        }
       +        if (r == -1 || (size_t)r >= sizeof(pat)) {
       +                fprintf(stderr, "warning: pattern too large, ignoring\n");
                        return 0;
       -#endif
       -                m = 1;
                }
       -        /*m = r ? !m : m;*/
       -        return m;
       +
       +        /* TODO: block type mask */
       +        if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1))
       +                return 1;
       +
       +        return 0;
        }
        
        static int
       @@ -619,6 +660,7 @@ parserule(struct filterrule *f, char *s)
                /* has options */
                if (!(f->uri = westrndup(s, p - s)))
                        return -1;
       +
                s = ++p;
        
                /* blockmask, has options? default: allow all options, case-sensitive
       @@ -723,27 +765,23 @@ getglobalcss(void)
        }
        
        char *
       -getdocumentcss(const char *uri)
       +getdocumentcss(const char *fromuri)
        {
                const char *s;
       -        char domain[256];
       +        char fromdomain[256];
                String sitecss;
                struct filterrule *r;
                size_t len;
        
       -        if (!uri)
       -                return NULL;
       -
       -        if ((s = strstr(uri, "://")))
       -                s += sizeof("://") - 1;
       -        else
       -                s = uri;
       -        len = strcspn(s, "/"); /* TODO: ":/" */
       -        memcpy(domain, s, len);
       -        domain[len] = '\0';
       +        /* skip protocol */
       +        if ((s = strstr(fromuri, "://")))
       +                fromuri = s + sizeof("://") - 1;
       +        len = strcspn(fromuri, "/"); /* TODO: ":/" */
       +        memcpy(fromdomain, s, len);
       +        fromdomain[len] = '\0';
        
       -        printf("uri: %s\n", uri);
       -        printf("domain: %s\n", domain);
       +        printf("fromuri:    %s\n", fromuri);
       +        printf("fromdomain: %s\n", fromdomain);
        
                /* DEBUG: timing */
                struct timespec tp_start, tp_end, tp_diff;
       @@ -754,7 +792,8 @@ getdocumentcss(const char *uri)
                /* site-specific CSS */
                memset(&sitecss, 0, sizeof(sitecss));
                for (r = rules; r; r = r->next) {
       -                if (!r->css || !r->domains || !matchrule(r, "", "", domain))
       +                if (!r->css || !r->domains ||
       +                    !matchrule(r, "", fromdomain, "", "", "", "", ""))
                                continue;
        
                        len = strlen(r->css);
       @@ -792,28 +831,39 @@ getdocumentcss(const char *uri)
        
        err:
                free(sitecss.data);
       +        /*memset(&sitecss, 0, sizeof(sitecss));*/
       +
                return NULL;
        }
        
        int
       -allowrequest(const char *uri, const char *requri)
       +allowrequest(const char *fromuri, const char *requri)
        {
       -        char domain[256];
                struct filterrule *r;
       -        const char *s;
       +        char fromdomain[256], reqdomain[256];
       +        const char *s, *reqrel, *fromrel;
                size_t len;
                int status = 1;
        
       -        if (!uri || !strcmp(requri, uri))
       -                return 1;
       +        /* skip protocol part */
       +        if ((s = strstr(fromuri, "://")))
       +                fromuri = s + sizeof("://") - 1;
       +        if ((s = strstr(requri, "://")))
       +                requri = s + sizeof("://") - 1;
        
       -        if ((s = strstr(uri, "://")))
       -                s += sizeof("://") - 1;
       -        else
       -                s = uri;
       -        len = strcspn(s, "/"); /* TODO: ":/" */
       -        memcpy(domain, s, len);
       -        domain[len] = '\0';
       +        len = strcspn(fromuri, ":/"); /* TODO: ":/", but support IPV6... */
       +        memcpy(fromdomain, fromuri, len);
       +        fromdomain[len] = '\0';
       +
       +        len = strcspn(requri, ":/"); /* TODO: ":/", but support IPV6... */
       +        memcpy(reqdomain, requri, len);
       +        reqdomain[len] = '\0';
       +
       +        fromrel = &fromuri[strcspn(fromuri, "/")];
       +        reqrel = &requri[strcspn(requri, "/")];
       +
       +        printf("req %s = %s\n", requri, reqrel);
       +        printf("from %s = %s\n", fromuri, fromrel);
        
                /* DEBUG: timing */
                struct timespec tp_start, tp_end, tp_diff;
       @@ -823,12 +873,15 @@ allowrequest(const char *uri, const char *requri)
        
                /* match rules */
                for (r = rules; r; r = r->next) {
       -                if (!r->css && matchrule(r, requri, "csio^", domain)) {
       -                        printf("requri: %s\n", requri);
       -                        printf("uri:    %s\n", uri);
       -                        printf("domain: %s\n", domain);
       +                if (!r->css && matchrule(r, fromuri, fromdomain,
       +                                         fromrel, requri, reqdomain, reqrel, "csio^")) {
       +                        printf("reqrel:      %s\n", reqrel);
       +                        printf("reqdomain:   %s\n", reqdomain);
       +                        printf("requri:      %s\n", requri);
       +                        printf("from uri:    %s\n", fromuri);
       +                        printf("from domain: %s\n", fromdomain);
        
       -                        fprintf(stderr, "blocked: %s, %s\n", domain, requri);
       +                        fprintf(stderr, "blocked: %s, %s\n", fromdomain, requri);
        
                                /* DEBUG: for showing the timing */
                                status = 0;
       @@ -851,13 +904,36 @@ end:
                }
        
                printf("%s [%s] timing: %lld sec, %.3f ms\n",
       -                requri, uri, (long long)tp_diff.tv_sec,
       +                requri, fromuri, (long long)tp_diff.tv_sec,
                        (float)tp_diff.tv_nsec / 1000000.0f);
        
                return status;
        }
        
        void
       +cleanup(void)
       +{
       +        struct filterrule *r;
       +        struct filterdomain *d;
       +
       +        free(globalcss.data);
       +        memset(&globalcss, 0, sizeof(globalcss));
       +
       +        for (r = rules; r; r = rules) {
       +                for (d = r->domains; d; d = r->domains) {
       +                        free(d->domain);
       +                        r->domains = d->next;
       +                        free(d);
       +                }
       +                free(r->css);
       +                free(r->uri);
       +                rules = r->next;
       +                free(r);
       +        }
       +        rules = NULL;
       +}
       +
       +void
        init(void)
        {
                struct filterrule *r;
       @@ -906,8 +982,7 @@ init(void)
        
                        len = strlen(r->css);
                        if (string_append(&globalcss, r->css, strlen(r->css)) < len) {
       -                        weprintf("cannot load global css selectors "
       -                                 "in memory\n");
       +                        weprintf("cannot load global css selectors in memory\n");
                                cleanup();
                                return;
                        }
 (DIR) diff --git a/surf-adblock.c b/surf-adblock.c
       @@ -67,14 +67,14 @@ static gboolean
        sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
                           WebKitURIResponse *res, Page *p)
        {
       -        const char *uri, *requri;
       +        const char *fromuri, *requri;
        
                if (!webkit_uri_request_get_http_method(req))
                        return TRUE; /* TRUE = don't handle any more events */
       -        uri = webkit_web_page_get_uri(p->webpage);
       +        fromuri = webkit_web_page_get_uri(p->webpage);
                requri = webkit_uri_request_get_uri(req);
        
       -        return allowrequest(uri, requri) ? FALSE : TRUE;
       +        return allowrequest(fromuri, requri) ? FALSE : TRUE;
        }
        
        static void
 (DIR) diff --git a/tests/tests.c b/tests/tests.c
       @@ -1,25 +1,21 @@
        #include "../adblock.c"
        
       -void
       -cleanup(void)
       -{
       -        struct filterrule *r;
       -        struct filterdomain *d;
       -
       -        free(globalcss.data);
       -
       -        for (r = rules; r; r = rules) {
       -                for (d = r->domains; d; d = r->domains) {
       -                        free(d->domain);
       -                        r->domains = d->next;
       -                        free(d);
       -                }
       -                free(r->css);
       -                free(r->uri);
       -                rules = r->next;
       -                free(r);
       -        }
       -}
       +/*
       +
       +TODO: add tests:
       +
       +The rule ||example.com/banner.gif should block all of these addresses:
       +
       +        http://example.com/banner.gif
       +        https://example.com/banner.gif
       +        http://www.example.com/banner.gif
       +
       +while not blocking:
       +
       +        http://badexample.com/banner.gif
       +        http://gooddomain.example/analyze?http://example.com/banner.gif
       +
       +*/
        
        int
        main(void)