improvements - surf-adblock - Surf adblock web extension
(HTM) git clone git://git.codemadness.org/surf-adblock
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 3cc61dad61ee13b47cc3b6a2931de9413c4c6176
(DIR) parent 84d3f064e393f5856f4bbbfb519b267ed4a5aa0a
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Mon, 5 Jun 2017 17:36:10 +0200
improvements
- fix CARET in match()
- fix matchbegin rules (rules starting with ||).
Diffstat:
M TODO | 2 ++
M adblock.c | 233 ++++++++++++++++++++-----------
M surf-adblock.c | 6 +++---
M tests/tests.c | 36 ++++++++++++++-----------------
4 files changed, 175 insertions(+), 102 deletions(-)
---
(DIR) diff --git a/TODO b/TODO
@@ -1,3 +1,5 @@
+- simplify match, there are only a few rules with multiple *.
+
- loadrules: return struct rules* ?
on error free rules.
(DIR) diff --git a/adblock.c b/adblock.c
@@ -35,8 +35,8 @@ struct filterrule {
int matchend;
/* is exception rule: prefix @@ for ABP or #@# for CSS */
int isexception;
- char *css; /* if non-NULL is CSS rule / hide element rule */
- char *uri;
+ const char *css; /* if non-NULL is CSS rule / hide element rule */
+ const char *uri;
struct filterdomain *domains;
struct filterrule *next;
};
@@ -184,14 +184,14 @@ string_append(String *s, const char *data, size_t len)
memcpy(s->data + s->len, data, len);
s->len = newlen;
s->data[s->len] = '\0';
+
return len;
}
#define END 0
#define UNMATCHABLE -2
-#define BRACKET -3
-#define CARET -4
-#define STAR -5
+#define CARET -3
+#define STAR -4
static int
str_next(const char *str, size_t n, size_t *step)
@@ -275,11 +275,19 @@ match(const char *pat, const char *str, int fcase)
pat++;
m--;
break;
- default:
+ case CARET:
k = str_next(str, n, &sinc);
- /* TODO: write a test-case */
- if (c == CARET && (k == '?' || k == '/' || k <= 0))
+ if (k <= 0)
+ return (c==END) ? 0 : 1;
+ str += sinc;
+ n -= sinc;
+ if (k != '?' && k != '/')
return 1;
+ pat++;
+ m--;
+ break;
+ default:
+ k = str_next(str, n, &sinc);
if (k <= 0)
return (c==END) ? 0 : 1;
str += sinc;
@@ -341,9 +349,14 @@ match(const char *pat, const char *str, int fcase)
break;
}
s += sinc;
- kfold = fcase ? casefold(k) : k;
- if (k != c && kfold != c)
- return 1;
+ if (c == CARET) {
+ if (k != '/' && k != '?')
+ return 1;
+ } else {
+ kfold = fcase ? casefold(k) : k;
+ if (k != c && kfold != c)
+ return 1;
+ }
}
/* We're all done with the tails now, so throw them out */
@@ -366,10 +379,16 @@ match(const char *pat, const char *str, int fcase)
k = str_next(s, endstr-s, &sinc);
if (!k)
return 1;
- kfold = fcase ? casefold(k) : k;
- if (k != c && kfold != c)
- break;
s += sinc;
+ if (c == CARET) {
+ if (k != '/' && k != '?')
+ break;
+ } else {
+ kfold = fcase ? casefold(k) : k;
+ if (k != c && kfold != c)
+ break;
+ }
+
}
if (c == STAR) continue;
/* If we failed, advance str, by 1 char if it's a valid
@@ -486,17 +505,20 @@ matchdomain(const char *s, const char *domain)
}
static int
-matchrule(struct filterrule *f, const char *uri, const char *type,
- const char *domain)
+matchrule(struct filterrule *f, const char *fromuri, const char *fromdomain,
+ const char *fromrel,
+ const char *requri, const char *reqdomain, const char *reqrel,
+ const char *type)
{
/* NOTE: order matters, see FilterType enum values */
struct filterdomain *d;
char pat[1024];
- int r, m;
+ const char *uri;
+ int len, r;
r = f->domains ? 0 : 1;
for (d = f->domains; d; d = d->next) {
- if (matchdomain(d->domain, domain)) {
+ if (matchdomain(d->domain, fromdomain)) {
if (r && d->inverse)
r = 0;
else if (!r && !d->inverse)
@@ -521,39 +543,58 @@ matchrule(struct filterrule *f, const char *uri, const char *type,
return 0;
#endif
- r = snprintf(pat, sizeof(pat), "%s%s%s",
- f->matchbegin ? "" : "*",
- f->uri,
- f->matchend ? "" : "*");
- if (r == -1 || (size_t)r >= sizeof(pat)) {
- fprintf(stderr, "warning: pattern too large, ignoring\n");
- return 0;
- }
-
- /* DEBUG */
+ /* match begin including domain */
if (f->matchbegin) {
- printf("pat: %s, uri: %s, domain: %s\n", pat, uri, domain);
- }
+ /* TODO: match domain part of pattern */
+ /* TODO: preprocess pattern if it is matchbegin? */
- m = 0;
- if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
-#if 0
- for (; *type; type++) {
- for (i = 0; blockstr[i]; i++) {
- if (blockstr[i] == *type &&
- f->block & (1 << i))
- printf("block type '%c'\n", blockstr[i]);
- return 1;
- }
+ len = strcspn(f->uri, "^/");
+
+ /* match domain without dot */
+ r = snprintf(pat, sizeof(pat), "%.*s",
+ len, f->uri);
+ if (r == -1 || (size_t)r >= sizeof(pat)) {
+ fprintf(stderr, "warning: pattern too large, ignoring\n");
+ return 0;
+ }
+
+ /* TODO: block type mask */
+ if (match(pat, reqdomain, (f->block & FilterTypeMatchCase) ? 0 : 1)) {
+ /* match domain with dot */
+ r = snprintf(pat, sizeof(pat), "*.%.*s",
+ len, f->uri);
+ if (r == -1 || (size_t)r >= sizeof(pat)) {
+ fprintf(stderr, "warning: pattern too large, ignoring\n");
+ return 0;
}
+
+ /* TODO: block type mask */
+ if (match(pat, reqdomain, (f->block & FilterTypeMatchCase) ? 0 : 1))
+ return 0;
}
+ /* match on path */
+ r = snprintf(pat, sizeof(pat), "*%s%s",
+ f->uri + len,
+ f->matchend ? "" : "*");
+ uri = reqrel;
+ } else {
+ r = snprintf(pat, sizeof(pat), "*%s%s",
+ f->uri,
+ f->matchend ? "" : "*");
+ uri = requri;
+
+ }
+ if (r == -1 || (size_t)r >= sizeof(pat)) {
+ fprintf(stderr, "warning: pattern too large, ignoring\n");
return 0;
-#endif
- m = 1;
}
- /*m = r ? !m : m;*/
- return m;
+
+ /* TODO: block type mask */
+ if (!match(pat, uri, (f->block & FilterTypeMatchCase) ? 0 : 1))
+ return 1;
+
+ return 0;
}
static int
@@ -619,6 +660,7 @@ parserule(struct filterrule *f, char *s)
/* has options */
if (!(f->uri = westrndup(s, p - s)))
return -1;
+
s = ++p;
/* blockmask, has options? default: allow all options, case-sensitive
@@ -723,27 +765,23 @@ getglobalcss(void)
}
char *
-getdocumentcss(const char *uri)
+getdocumentcss(const char *fromuri)
{
const char *s;
- char domain[256];
+ char fromdomain[256];
String sitecss;
struct filterrule *r;
size_t len;
- if (!uri)
- return NULL;
-
- if ((s = strstr(uri, "://")))
- s += sizeof("://") - 1;
- else
- s = uri;
- len = strcspn(s, "/"); /* TODO: ":/" */
- memcpy(domain, s, len);
- domain[len] = '\0';
+ /* skip protocol */
+ if ((s = strstr(fromuri, "://")))
+ fromuri = s + sizeof("://") - 1;
+ len = strcspn(fromuri, "/"); /* TODO: ":/" */
+ memcpy(fromdomain, s, len);
+ fromdomain[len] = '\0';
- printf("uri: %s\n", uri);
- printf("domain: %s\n", domain);
+ printf("fromuri: %s\n", fromuri);
+ printf("fromdomain: %s\n", fromdomain);
/* DEBUG: timing */
struct timespec tp_start, tp_end, tp_diff;
@@ -754,7 +792,8 @@ getdocumentcss(const char *uri)
/* site-specific CSS */
memset(&sitecss, 0, sizeof(sitecss));
for (r = rules; r; r = r->next) {
- if (!r->css || !r->domains || !matchrule(r, "", "", domain))
+ if (!r->css || !r->domains ||
+ !matchrule(r, "", fromdomain, "", "", "", "", ""))
continue;
len = strlen(r->css);
@@ -792,28 +831,39 @@ getdocumentcss(const char *uri)
err:
free(sitecss.data);
+ /*memset(&sitecss, 0, sizeof(sitecss));*/
+
return NULL;
}
int
-allowrequest(const char *uri, const char *requri)
+allowrequest(const char *fromuri, const char *requri)
{
- char domain[256];
struct filterrule *r;
- const char *s;
+ char fromdomain[256], reqdomain[256];
+ const char *s, *reqrel, *fromrel;
size_t len;
int status = 1;
- if (!uri || !strcmp(requri, uri))
- return 1;
+ /* skip protocol part */
+ if ((s = strstr(fromuri, "://")))
+ fromuri = s + sizeof("://") - 1;
+ if ((s = strstr(requri, "://")))
+ requri = s + sizeof("://") - 1;
- if ((s = strstr(uri, "://")))
- s += sizeof("://") - 1;
- else
- s = uri;
- len = strcspn(s, "/"); /* TODO: ":/" */
- memcpy(domain, s, len);
- domain[len] = '\0';
+ len = strcspn(fromuri, ":/"); /* TODO: ":/", but support IPV6... */
+ memcpy(fromdomain, fromuri, len);
+ fromdomain[len] = '\0';
+
+ len = strcspn(requri, ":/"); /* TODO: ":/", but support IPV6... */
+ memcpy(reqdomain, requri, len);
+ reqdomain[len] = '\0';
+
+ fromrel = &fromuri[strcspn(fromuri, "/")];
+ reqrel = &requri[strcspn(requri, "/")];
+
+ printf("req %s = %s\n", requri, reqrel);
+ printf("from %s = %s\n", fromuri, fromrel);
/* DEBUG: timing */
struct timespec tp_start, tp_end, tp_diff;
@@ -823,12 +873,15 @@ allowrequest(const char *uri, const char *requri)
/* match rules */
for (r = rules; r; r = r->next) {
- if (!r->css && matchrule(r, requri, "csio^", domain)) {
- printf("requri: %s\n", requri);
- printf("uri: %s\n", uri);
- printf("domain: %s\n", domain);
+ if (!r->css && matchrule(r, fromuri, fromdomain,
+ fromrel, requri, reqdomain, reqrel, "csio^")) {
+ printf("reqrel: %s\n", reqrel);
+ printf("reqdomain: %s\n", reqdomain);
+ printf("requri: %s\n", requri);
+ printf("from uri: %s\n", fromuri);
+ printf("from domain: %s\n", fromdomain);
- fprintf(stderr, "blocked: %s, %s\n", domain, requri);
+ fprintf(stderr, "blocked: %s, %s\n", fromdomain, requri);
/* DEBUG: for showing the timing */
status = 0;
@@ -851,13 +904,36 @@ end:
}
printf("%s [%s] timing: %lld sec, %.3f ms\n",
- requri, uri, (long long)tp_diff.tv_sec,
+ requri, fromuri, (long long)tp_diff.tv_sec,
(float)tp_diff.tv_nsec / 1000000.0f);
return status;
}
void
+cleanup(void)
+{
+ struct filterrule *r;
+ struct filterdomain *d;
+
+ free(globalcss.data);
+ memset(&globalcss, 0, sizeof(globalcss));
+
+ for (r = rules; r; r = rules) {
+ for (d = r->domains; d; d = r->domains) {
+ free(d->domain);
+ r->domains = d->next;
+ free(d);
+ }
+ free(r->css);
+ free(r->uri);
+ rules = r->next;
+ free(r);
+ }
+ rules = NULL;
+}
+
+void
init(void)
{
struct filterrule *r;
@@ -906,8 +982,7 @@ init(void)
len = strlen(r->css);
if (string_append(&globalcss, r->css, strlen(r->css)) < len) {
- weprintf("cannot load global css selectors "
- "in memory\n");
+ weprintf("cannot load global css selectors in memory\n");
cleanup();
return;
}
(DIR) diff --git a/surf-adblock.c b/surf-adblock.c
@@ -67,14 +67,14 @@ static gboolean
sendrequest(WebKitWebPage *wp, WebKitURIRequest *req,
WebKitURIResponse *res, Page *p)
{
- const char *uri, *requri;
+ const char *fromuri, *requri;
if (!webkit_uri_request_get_http_method(req))
return TRUE; /* TRUE = don't handle any more events */
- uri = webkit_web_page_get_uri(p->webpage);
+ fromuri = webkit_web_page_get_uri(p->webpage);
requri = webkit_uri_request_get_uri(req);
- return allowrequest(uri, requri) ? FALSE : TRUE;
+ return allowrequest(fromuri, requri) ? FALSE : TRUE;
}
static void
(DIR) diff --git a/tests/tests.c b/tests/tests.c
@@ -1,25 +1,21 @@
#include "../adblock.c"
-void
-cleanup(void)
-{
- struct filterrule *r;
- struct filterdomain *d;
-
- free(globalcss.data);
-
- for (r = rules; r; r = rules) {
- for (d = r->domains; d; d = r->domains) {
- free(d->domain);
- r->domains = d->next;
- free(d);
- }
- free(r->css);
- free(r->uri);
- rules = r->next;
- free(r);
- }
-}
+/*
+
+TODO: add tests:
+
+||example.com/banner.gif will block all these addresses
+
+ http://example.com/banner.gif
+ https://example.com/banner.gif
+ http://www.example.com/banner.gif
+
+while not blocking:
+
+ http://badexample.com/banner.gif
+ http://gooddomain.example/analyze?http://example.com/banner.gif
+
+*/
int
main(void)