selector syntax: document it and add feature to filter on a specific nth node - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 56ec7ea6c49d79cc3aaf301d2e6040e15d17785a
(DIR) parent 94f0ad42fcfbe17b01d9e573a786435d1acc0232
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 8 Sep 2023 11:07:57 +0200
selector syntax: document it and add feature to filter on a specific nth node
Diffstat:
M webdump.1 | 38 +++++++++++++++++++++++++++----
M webdump.c | 33 ++++++++++++++++++++++++++++---
2 files changed, 63 insertions(+), 8 deletions(-)
---
(DIR) diff --git a/webdump.1 b/webdump.1
@@ -1,4 +1,4 @@
-.Dd September 7, 2023
+.Dd September 8, 2023
.Dt WEBDUMP 1
.Os
.Sh NAME
@@ -36,12 +36,16 @@ not enabled.
.It Fl r
Toggle if line-wrapping mode is enabled, by default it is not enabled.
.It Fl s
-CSS-like selectors, this sets a reader mode to hide content
-matching the selector, for example: "main" or "main#id" or "main.class".
+CSS-like selectors, this sets a reader mode to hide content matching the
+selector, see the section
+.Sx SELECTOR SYNTAX
+for the syntax.
Multiple selectors can be specified by separating them with a comma.
.It Fl u
-CSS-like selectors, this sets a reader mode to hide content
-matching the selector, for example: "main" or "main#id" or "main.class".
+CSS-like selectors, this sets a reader mode to hide content matching the
+selector, see the section
+.Sx SELECTOR SYNTAX
+for the syntax.
Multiple selectors can be specified by separating them with a comma.
.It Fl w Ar termwidth
The terminal width.
@@ -49,6 +53,30 @@ The default is 77 characters.
.It Fl x
Write resources as TAB-separated lines to file descriptor 3.
.El
+.Sh SELECTOR SYNTAX
+The syntax has some inspiration from CSS, but it is more limited.
+Some examples:
+.Bl -item
+.It
+"main" would match on the "main" tags.
+.It
+"#someid" would match on any tag which has the id attribute set to "someid".
+.It
+".someclass" would match on any tag which has the class attribute set to
+"someclass".
+.It
+"main#someid" would match on the "main" tag which has the id attribute set to
+"someid".
+.It
+"main.someclass" would match on the "main" tags which have the class
+attribute set to "someclass".
+.It
+"ul li" would match on any "li" tag which also has a parent "ul" tag.
+.It
+"li@0" would match on any "li" tag which is also the first child element of its
+parent container.
+Note that this differs from filtering on a collection of "li" elements.
+.El
.Sh EXIT STATUS
.Ex -std
.Sh EXAMPLES
(DIR) diff --git a/webdump.c b/webdump.c
@@ -123,6 +123,7 @@ struct node {
struct selectornode {
char tagname[256];
+ long index; /* index of node to match on: -1 if not matching on index */
/* attributes */
char id[256];
char classnames[1024];
@@ -1073,11 +1074,13 @@ int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
int depth = 0, len;
+ long l;
const char *s, *start;
char tmp[256];
int nameset = 0;
memset(&nodes[0], 0, sizeof(nodes[0]));
+ nodes[0].index = -1;
s = sel;
for (; *s && ISSPACE((unsigned char)*s); s++)
@@ -1087,7 +1090,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
for (; ; s++) {
/* end of tag */
if (!nameset &&
- (*s == '#' || *s == '.' || *s == '[' ||
+ (*s == '#' || *s == '.' || *s == '@' ||
*s == '\0' || ISSPACE((unsigned char)*s))) {
nameset = 1;
len = s - start; /* tag name */
@@ -1111,15 +1114,32 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
nameset = 0;
memset(&nodes[depth], 0, sizeof(nodes[depth]));
+ nodes[depth].index = -1;
/* end of selector */
if (*s == '\0')
break;
}
+ /* index */
+ if (*s == '@') {
+ len = strcspn(s + 1, ".#@ \t\n");
+ if (len >= sizeof(tmp))
+ return 0;
+ memcpy(tmp, s + 1, len);
+ tmp[len] = '\0';
+
+ l = strtol(tmp, NULL, 10);
+ if (l >= 0)
+ nodes[depth].index = l;
+ s += len;
+ start = s + 1;
+ continue;
+ }
+
/* id */
if (*s == '#') {
- len = strcspn(s + 1, ".#[ \t\n");
+ len = strcspn(s + 1, ".#@ \t\n");
if (len >= sizeof(tmp))
return 0;
memcpy(tmp, s + 1, len);
@@ -1132,7 +1152,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
/* class */
if (*s == '.') {
- len = strcspn(s + 1, ".#[ \t\n");
+ len = strcspn(s + 1, ".#@ \t\n");
if (len >= sizeof(tmp))
return 0;
memcpy(tmp, s + 1, len);
@@ -1225,6 +1245,13 @@ iscssmatch(struct selector *sel, struct node *root, int maxdepth)
!isclassmatch(root[d].classnames, sel->nodes[md].classnames))
continue; /* no */
+ /* index matched */
+ if (sel->nodes[md].index != -1 &&
+ (d == 0 ||
+ root[d - 1].nchildren == 0 ||
+ sel->nodes[md].index != root[d - 1].nchildren - 1))
+ continue;
+
md++;
/* all matched of one selector */
if (md == sel->depth)