improvements - surf-adblock - Surf adblock web extension
(HTM) git clone git://git.codemadness.org/surf-adblock
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit d781090ae7718310fb13c83c1a8406be46a613b8
(DIR) parent b6cc76e9fcac3112086f2d2348ef53b16b59da9d
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 3 Jun 2017 21:54:05 +0200
improvements
- WIP: faster matching for simple rules. This is only a test at the moment, but
it reduces the maximum matching time from ~50ms to ~20ms on my machine.
- add support for exception rules.
- debug.sh: add a debug script for testing; it compiles surf-adblock as a standalone program with a main().
Diffstat:
M TODO | 14 +++++++++++++-
A debug.sh | 18 ++++++++++++++++++
M surf-adblock.c | 279 +++++++++++++++++++++++++------
3 files changed, 255 insertions(+), 56 deletions(-)
---
(DIR) diff --git a/TODO b/TODO
@@ -1,5 +1,15 @@
- fix tweakers.net popup / rule.
-- benchmark rule matching (timing).
+ this is in an exception rule...
+
+ make sure exception rules are always below in the list? modify awk script?
+
+- performance:
+ - benchmark rule matching (timing).
+ - bloom filters? some kind of cache?
+ - optimize simple filter case.
+
+- support separator "^" = [/\?]?
+ - test it better.
===
@@ -23,6 +33,8 @@ Docs:
and matchbegin or matchend set.
- make less CPU intensive.
- maybe even include it statically?
+ - optimize CSS rule matching (only per site?).
+
- optimize memory allocation.
- optimize: pregenerate one global stylesheet that applies to all sites?
- separate adblocker into daemon? not sure.
(DIR) diff --git a/debug.sh b/debug.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# ugly debug script: compile as standalone program for testing.
+
+cc -std=c99 -pedantic -Wall -Os -I. -I/usr/include -I/usr/X11R6/include \
+ `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` \
+ -DVERSION=\"0.1\" -DWEBEXTDIR=\"/usr/local/lib/surf\" -D_DEFAULT_SOURCE \
+ -DWEBEXTDIR=\"/usr/local/lib/surf\" \
+ `pkg-config --cflags gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` \
+ -DDEBUG \
+ -c surf-adblock.c
+cc -s -L/usr/lib -lc -L/usr/X11R6/lib -lX11 \
+ `pkg-config --libs gtk+-3.0 webkit2gtk-4.0 webkit2gtk-web-extension-4.0` -lgthread-2.0 -module -avoid-version -o surf-adblock \
+ surf-adblock.o
+
+chmod +x surf-adblock
+# NOTE: need to copy because of W^X.
+doas cp surf-adblock /usr/local/bin
+/usr/local/bin/surf-adblock
(DIR) diff --git a/surf-adblock.c b/surf-adblock.c
@@ -305,11 +305,11 @@ match(const char *pat, const char *str, int fcase)
break;
default:
k = str_next(str, n, &sinc);
+ /* TODO: write a test-case */
+ if (c == CARET && (k == '?' || k == '/' || k <= 0))
+ return 1;
if (k <= 0)
return (c==END) ? 0 : 1;
- if (c == CARET && (iswdigit(k) || iswalpha(k) ||
- strchr("_-.%", k)))
- return 1;
str += sinc;
n -= sinc;
kfold = fcase ? casefold(k) : k;
@@ -410,7 +410,6 @@ match(const char *pat, const char *str, int fcase)
return 0;
}
-
/*
domain=... if domain is prefixed with ~, ignore.
multiple domains can be separated with |
@@ -521,28 +520,63 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
/* NOTE: order matters, see FilterType enum values */
struct filterdomain *d;
char pat[1024];
- int r;
-
- /* ignore exception rules for now, these are usually paid
- * for by sites to allow advertisements. */
- if (f->isexception)
- return 0;
+ int r, m;
- if (f->css) {
- r = f->domains ? 0 : 1;
- for (d = f->domains; d; d = d->next) {
- if (matchdomain(d->domain, domain)) {
- if (r && d->inverse)
- r = 0;
- else if (!r && !d->inverse)
- r = 1;
- } else if (r && !d->inverse) {
+ r = f->domains ? 0 : 1;
+ for (d = f->domains; d; d = d->next) {
+ if (matchdomain(d->domain, domain)) {
+ if (r && d->inverse)
r = 0;
- }
+ else if (!r && !d->inverse)
+ r = 1;
+ } else if (r && !d->inverse) {
+ r = 0;
}
+ }
+ if (f->css) {
+ /* DEBUG */
+ if (f->isexception)
+ printf("DEBUG, exception rule, CSS: %s, match? %d\n",
+ f->css, r);
return r;
}
+#if 1
+ /* skip allow rule, TODO: inverse? */
+ if (!r)
+ return 0;
+#endif
+
+#if 1
+ /* DEBUG: test, match if it is a simple pattern */
+ char *p;
+ p = strchr(f->uri, '*');
+ if (!p)
+ p = strchr(f->uri, '^');
+ if (!p) {
+ /* TODO: write a test-case */
+ if (f->block & FilterTypeMatchCase) {
+ if (f->matchbegin)
+ m = strncmp(uri, f->uri, strlen(f->uri)) == 0;
+ else if (f->matchend)
+ m = strlen(f->uri) <= strlen(uri) &&
+ strcmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0;
+ else
+ m = strstr(uri, f->uri) ? 1 : 0;
+ } else {
+ if (f->matchbegin)
+ m = strncasecmp(uri, f->uri, strlen(f->uri)) == 0;
+ else if (f->matchend)
+ m = strlen(f->uri) <= strlen(uri) &&
+ strcasecmp(&uri[strlen(uri) - strlen(f->uri)], f->uri) == 0;
+ else
+ m = strcasestr(uri, f->uri) ? 1 : 0;
+ }
+ /*m = r ? !m : m;*/
+ return m;
+ }
+#endif
+
r = snprintf(pat, sizeof(pat), "%s%s%s",
f->matchbegin ? "" : "*",
f->uri,
@@ -552,19 +586,8 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
return 0;
}
- r = f->domains ? 0 : 1;
- for (d = f->domains; d; d = d->next) {
- if (matchdomain(d->domain, domain)) {
- if (r && d->inverse)
- r = 0;
- else if (!r && !d->inverse)
- r = 1;
- } else if (r && !d->inverse) {
- r = 0;
- }
- }
-
- if (r && !match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
+ m = 0;
+ if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
#if 0
for (; *type; type++) {
for (i = 0; blockstr[i]; i++) {
@@ -575,11 +598,13 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
}
}
}
+
return 0;
#endif
- return 1;
+ m = 1;
}
- return 0;
+ /*m = r ? !m : m;*/
+ return m;
}
static int
@@ -695,6 +720,7 @@ end:
return 1;
}
+#if 0
static void
debugrule(struct filterrule *r)
{
@@ -702,6 +728,7 @@ debugrule(struct filterrule *r)
"%lu\n===\n", r->uri ? r->uri : "", r->css ? r->css : "",
r->isexception, r->block);
}
+#endif
static int
loadrules(FILE *fp)
@@ -775,6 +802,12 @@ documentloaded(WebKitWebPage *wp, Page *p)
printf("uri: %s\n", uri);
printf("domain: %s\n", domain);
+ /* DEBUG: timing */
+ struct timespec tp_start, tp_end, tp_diff;
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
+ }
+
/* site-specific CSS */
memset(&sitecss, 0, sizeof(sitecss));
for (r = rules; r; r = r->next) {
@@ -783,11 +816,38 @@ documentloaded(WebKitWebPage *wp, Page *p)
len = strlen(r->css);
if (string_append(&sitecss, r->css, len) < len)
return;
- len = sizeof("{display:none;}") -1;
- if (string_append(&sitecss, "{display:none;}", len) < len)
- return;
+
+ if (r->isexception) {
+ len = sizeof("{display:initial;}") -1;
+ if (string_append(&sitecss, "{display:initial;}", len) < len)
+ return;
+ } else {
+ len = sizeof("{display:none;}") -1;
+ if (string_append(&sitecss, "{display:none;}", len) < len)
+ return;
+ }
+ }
+/* printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");*/
+
+ /* DEBUG: timing */
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
}
- printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");
+
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
+ if (tp_diff.tv_nsec < 0) {
+ tp_diff.tv_sec--;
+ tp_diff.tv_nsec += 1000000000L;
+ }
+
+ printf("timing: %zu sec, %.3f ms\n",
+ tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f);
+
+ if (globalcss.data)
+ printf("global CSS length in bytes: %zu\n", strlen(globalcss.data));
+ if (sitecss.data)
+ printf("site CSS length in bytes: %zu\n", strlen(sitecss.data));
p->view = webkit_dom_document_get_default_view(doc);
@@ -819,6 +879,7 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
const char *s, *uri = webkit_web_page_get_uri(p->webpage),
*requri = webkit_uri_request_get_uri(req);
size_t len;
+ gboolean status = FALSE;
if (!uri || !strcmp(requri, uri) ||
(strncmp(uri, "http://", sizeof("http://") - 1) &&
@@ -830,6 +891,12 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
memcpy(domain, s, len);
domain[len] = '\0';
+ /* DEBUG: timing */
+ struct timespec tp_start, tp_end, tp_diff;
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
+ }
+
/* match rules */
for (r = rules; r; r = r->next) {
if (!r->css && matchrule(r, requri, "csio^", domain)) {
@@ -839,30 +906,32 @@ sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
fprintf(stderr, "blocked: %s, %s\n", domain, requri);
- return TRUE;
+ status = TRUE;
+ goto end;
}
}
- return FALSE;
-}
-
-static void
-webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused)
-{
- Page *np;
+end:
+ /* DEBUG: timing */
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
+ }
- if (!(np = newpage(p))) {
- weprintf("cannot associate webext with new page: %s\n",
- strerror(errno));
- return;
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
+ if (tp_diff.tv_nsec < 0) {
+ tp_diff.tv_sec--;
+ tp_diff.tv_nsec += 1000000000L;
}
- g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np);
- g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np);
+ printf("%s [%s] timing: %zu sec, %.3f ms\n",
+ requri, uri, tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f);
+
+ return status;
}
-G_MODULE_EXPORT void
-webkit_web_extension_initialize(WebKitWebExtension *ext)
+void
+init(void)
{
struct filterrule *r;
FILE *fp;
@@ -922,6 +991,106 @@ webkit_web_extension_initialize(WebKitWebExtension *ext)
return;
}
}
+}
+
+static void
+webpagecreated(WebKitWebExtension *e, WebKitWebPage *p, gpointer unused)
+{
+ Page *np;
+
+ if (!(np = newpage(p))) {
+ weprintf("cannot associate webext with new page: %s\n",
+ strerror(errno));
+ return;
+ }
+
+ g_signal_connect(p, "document-loaded", G_CALLBACK(documentloaded), np);
+ g_signal_connect(p, "send-request", G_CALLBACK(sendrequest), np);
+}
+G_MODULE_EXPORT void
+webkit_web_extension_initialize(WebKitWebExtension *ext)
+{
+ init();
g_signal_connect(ext, "page-created", G_CALLBACK(webpagecreated), NULL);
}
+
+#ifdef DEBUG
+int
+main(void)
+{
+ char domain[256];
+ String sitecss;
+ struct filterrule *r;
+ const char *s, *uri;
+ size_t len;
+
+ /* TEST */
+ uri = "https://tweakers.net/";
+
+ if (!uri || (strncmp(uri, "http://", sizeof("http://") - 1) &&
+ strncmp(uri, "https://", sizeof("https://") - 1)))
+ return;
+
+ init();
+
+ s = strstr(uri, "://") + sizeof("://") - 1;
+ len = strcspn(s, "/");
+ memcpy(domain, s, len);
+ domain[len] = '\0';
+
+ printf("uri: %s\n", uri);
+ printf("domain: %s\n", domain);
+
+ /* DEBUG: timing */
+ struct timespec tp_start, tp_end, tp_diff;
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_start) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
+ }
+
+ /* site-specific CSS */
+ memset(&sitecss, 0, sizeof(sitecss));
+ for (r = rules; r; r = r->next) {
+ if (!r->css || !r->domains || !matchrule(r, "", "", domain))
+ continue;
+ len = strlen(r->css);
+ if (string_append(&sitecss, r->css, len) < len)
+ return;
+ if (r->isexception) {
+ len = sizeof("{display:initial;}") -1;
+ if (string_append(&sitecss, "{display:initial;}", len) < len)
+ return;
+ } else {
+ len = sizeof("{display:none;}") -1;
+ if (string_append(&sitecss, "{display:none;}", len) < len)
+ return;
+ }
+ }
+ printf("sitecss: %s\n", sitecss.data ? sitecss.data : "<empty>");
+
+ /* DEBUG: timing */
+ if (clock_gettime(CLOCK_MONOTONIC, &tp_end) == -1) {
+ fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
+ }
+
+ tp_diff.tv_sec = tp_end.tv_sec - tp_start.tv_sec;
+ tp_diff.tv_nsec = tp_end.tv_nsec - tp_start.tv_nsec;
+ if (tp_diff.tv_nsec < 0) {
+ tp_diff.tv_sec--;
+ tp_diff.tv_nsec += 1000000000L;
+ }
+
+ printf("timing: %zu sec, %.3f ms\n",
+ tp_diff.tv_sec, (float)tp_diff.tv_nsec / 1000000.0f);
+
+ if (globalcss.data)
+ printf("global CSS length in bytes: %zu\n", strlen(globalcss.data));
+ if (sitecss.data)
+ printf("site CSS length in bytes: %zu\n", strlen(sitecss.data));
+
+ free(sitecss.data);
+ cleanup();
+
+ return 0;
+}
+#endif