improve base URL and <base href /> handling - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 0705fb754f00c7866b2cc8cee0739a88a584a2e1
(DIR) parent 7d4723febabeb679e1980c12b5dfd3b656475b4f
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 8 Sep 2023 13:09:37 +0200
improve base URL and <base href /> handling
- Parse the base URI once and reuse the structure (optimization).
- Once the base URI has been parsed successfully it cannot be overwritten
again. This matches browser behavior more closely.
Diffstat:
M webdump.1 | 1 +
M webdump.c | 30 ++++++++++++++++--------------
2 files changed, 17 insertions(+), 14 deletions(-)
---
(DIR) diff --git a/webdump.1 b/webdump.1
@@ -27,6 +27,7 @@ Toggle ANSI escape codes usage, by default it is not enabled.
.It Fl b Ar baseurl
Base URL of links.
This is used to make links absolute.
+The specified URL is always preferred over the value in a <base/> tag.
.It Fl i
Toggle if link reference numbers are displayed inline or not, by default it is
not enabled.
(DIR) diff --git a/webdump.c b/webdump.c
@@ -148,6 +148,8 @@ static const char *str_ruler = "-";
/* base href, to make URLs absolute */
static char *basehref = "";
static char basehrefdoc[4096]; /* base href in document, if any */
+static int basehrefset = 0; /* base href set and can be used? */
+static struct uri base;
/* buffers for some attributes of the current tag */
String attr_alt; /* alt attribute */
@@ -1311,14 +1313,13 @@ addlinkref(const char *url, const char *_type, int ishidden)
links_cur->ishidden = ishidden;
}
-/* TODO: make parsed base URL global and overwrite it once. */
static void
handleinlinelink(void)
{
- struct uri base, newuri, olduri;
+ struct uri newuri, olduri;
struct node *cur;
char buf[4096], *url;
- int b, r;
+ int r;
if (!showrefbottom && !showrefinline && !showurlinline && !resources)
return; /* there is no need to collect the reference */
@@ -1332,15 +1333,9 @@ handleinlinelink(void)
else
url = attr_href.data;
- b = -1;
- if (uri_hasscheme(url))
- ; /* already absolute: nothing to do */
- else if (basehref[0]) /* prefer -b option over <base> */
- b = uri_parse(basehref, &base);
- else if (basehrefdoc[0])
- b = uri_parse(basehrefdoc, &base);
-
- if (b != -1 &&
+ /* Not an absolute URL yet: try to make it absolute.
+ If it is not possible use the relative URL */
+ if (!uri_hasscheme(url) && basehrefset &&
uri_parse(url, &olduri) != -1 &&
uri_makeabs(&newuri, &olduri, &base) != -1 &&
newuri.proto[0]) {
@@ -1948,7 +1943,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
string_append(&attr_id, value, valuelen);
/* <base href="..." /> */
- if (!attrcmp(name, "href") && !tagcmp(tag, "base"))
+ if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base"))
strlcat(basehrefdoc, value, sizeof(basehrefdoc));
/* hide tags with attribute aria-hidden or hidden */
@@ -1992,6 +1987,10 @@ static void
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
size_t nl)
{
+ /* set base URL, if it is set it cannot be overwritten again */
+ if (!basehrefset && basehrefdoc[0] &&
+ !attrcmp(n, "href") && !tagcmp(t, "base"))
+ basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
}
static void
@@ -2013,7 +2012,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
else if (!attrcmp(n, "value"))
string_clear(&attr_value);
- if (!attrcmp(n, "href") && !tagcmp(t, "base"))
+ if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base"))
basehrefdoc[0] = '\0';
}
@@ -2040,6 +2039,9 @@ main(int argc, char **argv)
break;
case 'b':
basehref = EARGF(usage());
+ if (uri_parse(basehref, &base) == -1)
+ usage();
+ basehrefset = 1;
break;
case 'i':
showrefinline = !showrefinline;