various improvements - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 011b4885a533382d98f1aee6cb9619e280c99947
(DIR) parent 89c9108dc27fe27e0f028f67508a1156ed242d2a
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Mon, 18 Sep 2023 19:06:03 +0200
various improvements
Improve link references:
- Add RB tree to lookup link references: this uses a stripped-down version of
OpenBSD tree.h
- Add 2 separate linked-lists for the order of visible and hidden links.
- Hidden links and now also deduplicated.
Improve nested nodes and max depth:
- Rework and increase the allowed depth of nodes. Allocate them on the heap.
Diffstat:
M Makefile | 2 +-
A tree.h | 483 +++++++++++++++++++++++++++++++
M webdump.c | 177 ++++++++++++++++++++-----------
3 files changed, 597 insertions(+), 65 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -19,7 +19,7 @@ BIN = ${NAME}
SCRIPTS =
SRC = ${BIN:=.c}
-HDR = arg.h namedentities.h xml.h
+HDR = arg.h namedentities.h tree.h xml.h
LIBXML = libxml.a
LIBXMLSRC = \
(DIR) diff --git a/tree.h b/tree.h
@@ -0,0 +1,483 @@
+/* $OpenBSD: tree.h,v 1.31 2023/03/08 04:43:09 guenther Exp $ */
+/*
+ * Copyright 2002 Niels Provos <provos@citi.umich.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef TREE_H
+#define TREE_H
+
+/*
+ * This file defines a red-black tree structure.
+ *
+ * A red-black tree is a binary search tree with the node color as an
+ * extra attribute. It fulfills a set of conditions:
+ * - every search path from the root to a leaf consists of the
+ * same number of black nodes,
+ * - each red node (except for the root) has a black parent,
+ * - each leaf node is black.
+ *
+ * Every operation on a red-black tree is bounded as O(lg n).
+ * The maximum height of a red-black tree is 2lg (n+1).
+ */
+
+/* Macros that define a red-black tree */
+#define RB_HEAD(name, type) \
+struct name { \
+ struct type *rbh_root; /* root of the tree */ \
+}
+
+#define RB_INITIALIZER(root) \
+ { NULL }
+
+#define RB_INIT(root) do { \
+ (root)->rbh_root = NULL; \
+} while (0)
+
+#define RB_BLACK 0
+#define RB_RED 1
+#define RB_ENTRY(type) \
+struct { \
+ struct type *rbe_left; /* left element */ \
+ struct type *rbe_right; /* right element */ \
+ struct type *rbe_parent; /* parent element */ \
+ int rbe_color; /* node color */ \
+}
+
+#define RB_LEFT(elm, field) (elm)->field.rbe_left
+#define RB_RIGHT(elm, field) (elm)->field.rbe_right
+#define RB_PARENT(elm, field) (elm)->field.rbe_parent
+#define RB_COLOR(elm, field) (elm)->field.rbe_color
+#define RB_ROOT(head) (head)->rbh_root
+#define RB_EMPTY(head) (RB_ROOT(head) == NULL)
+
+#define RB_SET(elm, parent, field) do { \
+ RB_PARENT(elm, field) = parent; \
+ RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \
+ RB_COLOR(elm, field) = RB_RED; \
+} while (0)
+
+#define RB_SET_BLACKRED(black, red, field) do { \
+ RB_COLOR(black, field) = RB_BLACK; \
+ RB_COLOR(red, field) = RB_RED; \
+} while (0)
+
+#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \
+ (tmp) = RB_RIGHT(elm, field); \
+ if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field))) { \
+ RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \
+ } \
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \
+ else \
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \
+ } else \
+ (head)->rbh_root = (tmp); \
+ RB_LEFT(tmp, field) = (elm); \
+ RB_PARENT(elm, field) = (tmp); \
+} while (0)
+
+#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \
+ (tmp) = RB_LEFT(elm, field); \
+ if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field))) { \
+ RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \
+ } \
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \
+ else \
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \
+ } else \
+ (head)->rbh_root = (tmp); \
+ RB_RIGHT(tmp, field) = (elm); \
+ RB_PARENT(elm, field) = (tmp); \
+} while (0)
+
+/* Generates prototypes and inline functions */
+#define RB_PROTOTYPE(name, type, field, cmp) \
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp,)
+#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static)
+#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \
+attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \
+attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\
+attr struct type *name##_RB_REMOVE(struct name *, struct type *); \
+attr struct type *name##_RB_INSERT(struct name *, struct type *); \
+attr struct type *name##_RB_FIND(struct name *, struct type *); \
+attr struct type *name##_RB_NFIND(struct name *, struct type *); \
+attr struct type *name##_RB_NEXT(struct type *); \
+attr struct type *name##_RB_PREV(struct type *); \
+attr struct type *name##_RB_MINMAX(struct name *, int); \
+ \
+
+/* Main rb operation.
+ * Moves node close to the key of elm to top
+ */
+#define RB_GENERATE(name, type, field, cmp) \
+ RB_GENERATE_INTERNAL(name, type, field, cmp,)
+#define RB_GENERATE_STATIC(name, type, field, cmp) \
+ RB_GENERATE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static)
+#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \
+attr void \
+name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \
+{ \
+ struct type *parent, *gparent, *tmp; \
+ while ((parent = RB_PARENT(elm, field)) && \
+ RB_COLOR(parent, field) == RB_RED) { \
+ gparent = RB_PARENT(parent, field); \
+ if (parent == RB_LEFT(gparent, field)) { \
+ tmp = RB_RIGHT(gparent, field); \
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \
+ RB_COLOR(tmp, field) = RB_BLACK; \
+ RB_SET_BLACKRED(parent, gparent, field);\
+ elm = gparent; \
+ continue; \
+ } \
+ if (RB_RIGHT(parent, field) == elm) { \
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ tmp = parent; \
+ parent = elm; \
+ elm = tmp; \
+ } \
+ RB_SET_BLACKRED(parent, gparent, field); \
+ RB_ROTATE_RIGHT(head, gparent, tmp, field); \
+ } else { \
+ tmp = RB_LEFT(gparent, field); \
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \
+ RB_COLOR(tmp, field) = RB_BLACK; \
+ RB_SET_BLACKRED(parent, gparent, field);\
+ elm = gparent; \
+ continue; \
+ } \
+ if (RB_LEFT(parent, field) == elm) { \
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ tmp = parent; \
+ parent = elm; \
+ elm = tmp; \
+ } \
+ RB_SET_BLACKRED(parent, gparent, field); \
+ RB_ROTATE_LEFT(head, gparent, tmp, field); \
+ } \
+ } \
+ RB_COLOR(head->rbh_root, field) = RB_BLACK; \
+} \
+ \
+attr void \
+name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \
+{ \
+ struct type *tmp; \
+ while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \
+ elm != RB_ROOT(head)) { \
+ if (RB_LEFT(parent, field) == elm) { \
+ tmp = RB_RIGHT(parent, field); \
+ if (RB_COLOR(tmp, field) == RB_RED) { \
+ RB_SET_BLACKRED(tmp, parent, field); \
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ tmp = RB_RIGHT(parent, field); \
+ } \
+ if ((RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\
+ (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\
+ RB_COLOR(tmp, field) = RB_RED; \
+ elm = parent; \
+ parent = RB_PARENT(elm, field); \
+ } else { \
+ if (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\
+ struct type *oleft; \
+ if ((oleft = RB_LEFT(tmp, field)))\
+ RB_COLOR(oleft, field) = RB_BLACK;\
+ RB_COLOR(tmp, field) = RB_RED; \
+ RB_ROTATE_RIGHT(head, tmp, oleft, field);\
+ tmp = RB_RIGHT(parent, field); \
+ } \
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field);\
+ RB_COLOR(parent, field) = RB_BLACK; \
+ if (RB_RIGHT(tmp, field)) \
+ RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ elm = RB_ROOT(head); \
+ break; \
+ } \
+ } else { \
+ tmp = RB_LEFT(parent, field); \
+ if (RB_COLOR(tmp, field) == RB_RED) { \
+ RB_SET_BLACKRED(tmp, parent, field); \
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ tmp = RB_LEFT(parent, field); \
+ } \
+ if ((RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\
+ (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\
+ RB_COLOR(tmp, field) = RB_RED; \
+ elm = parent; \
+ parent = RB_PARENT(elm, field); \
+ } else { \
+ if (RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\
+ struct type *oright; \
+ if ((oright = RB_RIGHT(tmp, field)))\
+ RB_COLOR(oright, field) = RB_BLACK;\
+ RB_COLOR(tmp, field) = RB_RED; \
+ RB_ROTATE_LEFT(head, tmp, oright, field);\
+ tmp = RB_LEFT(parent, field); \
+ } \
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field);\
+ RB_COLOR(parent, field) = RB_BLACK; \
+ if (RB_LEFT(tmp, field)) \
+ RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ elm = RB_ROOT(head); \
+ break; \
+ } \
+ } \
+ } \
+ if (elm) \
+ RB_COLOR(elm, field) = RB_BLACK; \
+} \
+ \
+attr struct type * \
+name##_RB_REMOVE(struct name *head, struct type *elm) \
+{ \
+ struct type *child, *parent, *old = elm; \
+ int color; \
+ if (RB_LEFT(elm, field) == NULL) \
+ child = RB_RIGHT(elm, field); \
+ else if (RB_RIGHT(elm, field) == NULL) \
+ child = RB_LEFT(elm, field); \
+ else { \
+ struct type *left; \
+ elm = RB_RIGHT(elm, field); \
+ while ((left = RB_LEFT(elm, field))) \
+ elm = left; \
+ child = RB_RIGHT(elm, field); \
+ parent = RB_PARENT(elm, field); \
+ color = RB_COLOR(elm, field); \
+ if (child) \
+ RB_PARENT(child, field) = parent; \
+ if (parent) { \
+ if (RB_LEFT(parent, field) == elm) \
+ RB_LEFT(parent, field) = child; \
+ else \
+ RB_RIGHT(parent, field) = child; \
+ } else \
+ RB_ROOT(head) = child; \
+ if (RB_PARENT(elm, field) == old) \
+ parent = elm; \
+ (elm)->field = (old)->field; \
+ if (RB_PARENT(old, field)) { \
+ if (RB_LEFT(RB_PARENT(old, field), field) == old)\
+ RB_LEFT(RB_PARENT(old, field), field) = elm;\
+ else \
+ RB_RIGHT(RB_PARENT(old, field), field) = elm;\
+ } else \
+ RB_ROOT(head) = elm; \
+ RB_PARENT(RB_LEFT(old, field), field) = elm; \
+ if (RB_RIGHT(old, field)) \
+ RB_PARENT(RB_RIGHT(old, field), field) = elm; \
+ if (parent) { \
+ left = parent; \
+ do { \
+ } while ((left = RB_PARENT(left, field))); \
+ } \
+ goto color; \
+ } \
+ parent = RB_PARENT(elm, field); \
+ color = RB_COLOR(elm, field); \
+ if (child) \
+ RB_PARENT(child, field) = parent; \
+ if (parent) { \
+ if (RB_LEFT(parent, field) == elm) \
+ RB_LEFT(parent, field) = child; \
+ else \
+ RB_RIGHT(parent, field) = child; \
+ } else \
+ RB_ROOT(head) = child; \
+color: \
+ if (color == RB_BLACK) \
+ name##_RB_REMOVE_COLOR(head, parent, child); \
+ return (old); \
+} \
+ \
+/* Inserts a node into the RB tree */ \
+attr struct type * \
+name##_RB_INSERT(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp; \
+ struct type *parent = NULL; \
+ int comp = 0; \
+ tmp = RB_ROOT(head); \
+ while (tmp) { \
+ parent = tmp; \
+ comp = (cmp)(elm, parent); \
+ if (comp < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ RB_SET(elm, parent, field); \
+ if (parent != NULL) { \
+ if (comp < 0) \
+ RB_LEFT(parent, field) = elm; \
+ else \
+ RB_RIGHT(parent, field) = elm; \
+ } else \
+ RB_ROOT(head) = elm; \
+ name##_RB_INSERT_COLOR(head, elm); \
+ return (NULL); \
+} \
+ \
+/* Finds the node with the same key as elm */ \
+attr struct type * \
+name##_RB_FIND(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ int comp; \
+ while (tmp) { \
+ comp = cmp(elm, tmp); \
+ if (comp < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ return (NULL); \
+} \
+ \
+/* Finds the first node greater than or equal to the search key */ \
+attr struct type * \
+name##_RB_NFIND(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ struct type *res = NULL; \
+ int comp; \
+ while (tmp) { \
+ comp = cmp(elm, tmp); \
+ if (comp < 0) { \
+ res = tmp; \
+ tmp = RB_LEFT(tmp, field); \
+ } \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ return (res); \
+} \
+ \
+attr struct type * \
+name##_RB_NEXT(struct type *elm) \
+{ \
+ if (RB_RIGHT(elm, field)) { \
+ elm = RB_RIGHT(elm, field); \
+ while (RB_LEFT(elm, field)) \
+ elm = RB_LEFT(elm, field); \
+ } else { \
+ if (RB_PARENT(elm, field) && \
+ (elm == RB_LEFT(RB_PARENT(elm, field), field))) \
+ elm = RB_PARENT(elm, field); \
+ else { \
+ while (RB_PARENT(elm, field) && \
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\
+ elm = RB_PARENT(elm, field); \
+ elm = RB_PARENT(elm, field); \
+ } \
+ } \
+ return (elm); \
+} \
+ \
+attr struct type * \
+name##_RB_PREV(struct type *elm) \
+{ \
+ if (RB_LEFT(elm, field)) { \
+ elm = RB_LEFT(elm, field); \
+ while (RB_RIGHT(elm, field)) \
+ elm = RB_RIGHT(elm, field); \
+ } else { \
+ if (RB_PARENT(elm, field) && \
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \
+ elm = RB_PARENT(elm, field); \
+ else { \
+ while (RB_PARENT(elm, field) && \
+ (elm == RB_LEFT(RB_PARENT(elm, field), field)))\
+ elm = RB_PARENT(elm, field); \
+ elm = RB_PARENT(elm, field); \
+ } \
+ } \
+ return (elm); \
+} \
+ \
+attr struct type * \
+name##_RB_MINMAX(struct name *head, int val) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ struct type *parent = NULL; \
+ while (tmp) { \
+ parent = tmp; \
+ if (val < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else \
+ tmp = RB_RIGHT(tmp, field); \
+ } \
+ return (parent); \
+}
+
+#define RB_NEGINF -1
+#define RB_INF 1
+
+#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y)
+#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y)
+#define RB_FIND(name, x, y) name##_RB_FIND(x, y)
+#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y)
+#define RB_NEXT(name, x, y) name##_RB_NEXT(y)
+#define RB_PREV(name, x, y) name##_RB_PREV(y)
+#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF)
+#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF)
+
+#define RB_FOREACH(x, name, head) \
+ for ((x) = RB_MIN(name, head); \
+ (x) != NULL; \
+ (x) = name##_RB_NEXT(x))
+
+#define RB_FOREACH_SAFE(x, name, head, y) \
+ for ((x) = RB_MIN(name, head); \
+ ((x) != NULL) && ((y) = name##_RB_NEXT(x), 1); \
+ (x) = (y))
+
+#define RB_FOREACH_REVERSE(x, name, head) \
+ for ((x) = RB_MAX(name, head); \
+ (x) != NULL; \
+ (x) = name##_RB_PREV(x))
+
+#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \
+ for ((x) = RB_MAX(name, head); \
+ ((x) != NULL) && ((y) = name##_RB_PREV(x), 1); \
+ (x) = (y))
+
+#endif /* TREE_H */
(DIR) diff --git a/webdump.c b/webdump.c
@@ -10,6 +10,7 @@
#include "arg.h"
char *argv0;
+#include "tree.h"
#include "xml.h"
static XMLParser parser;
@@ -151,19 +152,31 @@ struct selectors {
size_t count;
};
-/* linked-list of link references */
+/* RB tree of link references */
struct linkref {
char *type;
enum TagId tagid;
char *url;
int ishidden;
size_t linknr;
- struct linkref *next;
+ RB_ENTRY(linkref) entry;
};
-static struct linkref *links_head;
-static struct linkref *links_cur;
-static int linkcount; /* visible link count */
+/* link references and hidden link references */
+struct linkref **visrefs;
+static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
+struct linkref **hiddenrefs;
+static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
+
+int
+linkrefcmp(struct linkref *r1, struct linkref *r2)
+{
+ return strcmp(r1->url, r2->url);
+}
+
+RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
+RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp)
+RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
static const char *str_bullet_item = "* ";
static const char *str_checkbox_checked = "x";
@@ -213,10 +226,11 @@ static char rbuf[1024];
static int rbuflen;
static int rnbufcells = 0; /* pending cell count to add */
-#define MAX_DEPTH 256
-static struct node nodes[MAX_DEPTH];
-static String nodes_links[MAX_DEPTH]; /* keep track of links per node */
-static int curnode;
+#define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */
+static struct node *nodes;
+static String *nodes_links; /* keep track of links per node */
+static size_t ncapnodes;
+static int curnode; /* current node depth */
/* reader / selector mode */
static int reader_mode = 0;
@@ -1378,39 +1392,59 @@ handleinlinealt(void)
}
}
-/* slow linear lookup of link references
- TODO: optimize it, maybe using tree.h RB_TREE? */
+/* lookup a link reference by url in the red-black tree */
static struct linkref *
findlinkref(const char *url)
{
- struct linkref *cur;
+ struct linkref find;
- for (cur = links_head; cur; cur = cur->next) {
- if (!strcmp(url, cur->url))
- return cur;
- }
- return NULL;
+ find.url = (char *)url;
+
+ return RB_FIND(linkreftree, &linkrefhead, &find);
}
+/* add a link reference. Returns the added link reference, or the existing link
+ reference if links are deduplicated */
static struct linkref *
-addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden,
- int linknr)
+addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden)
{
+ struct linkref *link;
+ size_t linknr;
+
+ /* if links are deduplicates return the existing link */
+ if (uniqrefs && (link = findlinkref(url)))
+ return link;
+
if (tagid == TagA)
_type = "link";
- /* add to linked list */
- if (!links_head)
- links_cur = links_head = ecalloc(1, sizeof(*links_head));
- else
- links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
- links_cur->url = estrdup(url);
- links_cur->type = estrdup(_type);
- links_cur->tagid = tagid;
- links_cur->ishidden = ishidden;
- links_cur->linknr = linknr;
+ link = ecalloc(1, sizeof(*link));
+
+ if (!ishidden) {
+ linknr = ++nvisrefs;
+ if (nvisrefs >= ncapvisrefs)
+ ncapvisrefs += 256; /* greedy alloc */
+ visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs);
+ visrefs[linknr - 1] = link; /* add pointer to list */
+ } else {
+ linknr = ++nhiddenrefs;
+ if (nhiddenrefs >= ncaphiddenrefs)
+ ncaphiddenrefs += 256; /* greedy alloc */
+ hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs);
+ hiddenrefs[linknr - 1] = link; /* add pointer to list */
+ }
- return links_cur;
+ link->url = estrdup(url);
+ link->type = estrdup(_type);
+ link->tagid = tagid;
+ link->ishidden = ishidden;
+ link->linknr = linknr;
+
+ /* add to tree: the tree is only used for checking unique link references */
+ if (uniqrefs)
+ RB_INSERT(linkreftree, &linkrefhead, link);
+
+ return link;
}
static void
@@ -1462,43 +1496,65 @@ handleinlinelink(void)
/* add hidden links directly to the reference,
the order doesn't matter */
if (cur->tag.displaytype & DisplayNone)
- addlinkref(url, cur->tag.name, cur->tag.id, 1, 0);
+ addlinkref(url, cur->tag.name, cur->tag.id, 1);
}
void
printlinkrefs(void)
{
+ struct linkref *ref;
size_t i;
- int hashiddenrefs = 0;
- if (!links_head)
+ if (!nvisrefs && !nhiddenrefs)
return;
if (resources) {
- for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
- dprintf(3, "%s\t%s\n", links_cur->type, links_cur->url);
+ for (i = 0; i < nvisrefs; i++) {
+ ref = visrefs[i];
+ dprintf(3, "%s\t%s\n", ref->type, ref->url);
+ }
+ for (i = 0; i < nhiddenrefs; i++) {
+ ref = hiddenrefs[i];
+ dprintf(3, "%s\t%s\n", ref->type, ref->url);
+ }
}
printf("\nReferences\n\n");
- i = 1;
- for (links_cur = links_head; links_cur; links_cur = links_cur->next) {
- if (links_cur->ishidden) {
- hashiddenrefs = 1;
- continue;
- }
- printf(" %zu. %s (%s)\n", links_cur->linknr, links_cur->url, links_cur->type);
- i++;
+ for (i = 0; i < nvisrefs; i++) {
+ ref = visrefs[i];
+ printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
}
- if (hashiddenrefs)
+ if (nhiddenrefs > 0)
printf("\n\nHidden references\n\n");
/* hidden links don't have a link number, just count them */
- for (links_cur = links_head; links_cur; links_cur = links_cur->next) {
- if (!links_cur->ishidden)
- continue;
- printf(" %zu. %s (%s)\n", i, links_cur->url, links_cur->type);
- i++;
+ for (i = 0; i < nhiddenrefs; i++) {
+ ref = hiddenrefs[i];
+ printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
+ }
+}
+
+#define NODE_CAP_INC 256
+
+/* increase node depth, allocate space for nodes if needed */
+static void
+incnode(void)
+{
+ curnode++;
+
+ if (curnode >= MAX_NODE_DEPTH)
+ errx(1, "max node depth reached: %d", curnode);
+
+ if (curnode >= ncapnodes) {
+ nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC));
+ nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC));
+
+ /* clear new region */
+ memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC);
+ memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC);
+
+ ncapnodes += NODE_CAP_INC; /* greedy alloc */
}
}
@@ -1670,17 +1726,8 @@ endnode(struct node *cur)
/* add link and show the link number in the visible order */
if (!ishidden && nodes_links[curnode].len > 0) {
- if (uniqrefs)
- ref = findlinkref(nodes_links[curnode].data);
- else
- ref = NULL;
-
- /* new link: add it */
- if (!ref) {
- linkcount++;
- ref = addlinkref(nodes_links[curnode].data,
- cur->tag.name, cur->tag.id, ishidden, linkcount);
- }
+ ref = addlinkref(nodes_links[curnode].data,
+ cur->tag.name, cur->tag.id, ishidden);
if (showrefinline || showurlinline) {
hflush();
@@ -1825,9 +1872,6 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
char *s;
int i, j, k, nchildfound, parenttype;
- if (curnode >= MAX_DEPTH - 2)
- errx(1, "max tag depth reached: %d\n", curnode);
-
cur = &nodes[curnode];
string_clear(&attr_alt);
@@ -1920,7 +1964,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
}
}
- curnode++;
+ incnode();
string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */
cur = &nodes[curnode];
memset(cur, 0, sizeof(*cur)); /* clear / reset node */
@@ -2333,6 +2377,11 @@ main(int argc, char **argv)
usage();
} ARGEND
+ /* initial nodes */
+ ncapnodes = NODE_CAP_INC;
+ nodes = ecalloc(ncapnodes, sizeof(*nodes));
+ nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
+
/* top-most document root needs initialization */
nodes[0].tag.name = "";