add -F and -R option to allow to specify a different field and record separator - json2tsv - JSON to TSV converter
(HTM) git clone git://git.codemadness.org/json2tsv
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 18215ba6f1a9f2c76ffceded0670eb2b2f466792
(DIR) parent 226d85203f7ea26dcc71c98e0fb7fe3ffb78176b
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 18 Sep 2021 16:17:32 +0200
add -F and -R option to allow to specify a different field and record separator
Diffstat:
M README | 3 +++
M json2tsv.1 | 101 +++++++++++++++++++++++++++----
M json2tsv.c | 96 +++++++++++++++++++++++++++----
3 files changed, 176 insertions(+), 24 deletions(-)
---
(DIR) diff --git a/README b/README
@@ -112,6 +112,9 @@ $2 == "s" && index($3, "\\") {
print $3;
}'
+To avoid having to unescape the data, a different field separator and record
+separator can be set using the -F and -R options.
+
License
-------
(DIR) diff --git a/json2tsv.1 b/json2tsv.1
@@ -1,4 +1,4 @@
-.Dd November 5, 2019
+.Dd September 24, 2021
.Dt JSON2TSV 1
.Os
.Sh NAME
@@ -8,6 +8,8 @@
.Nm
.Op Fl n
.Op Fl r
+.Op Fl F Ar fs
+.Op Fl R Ar rs
.Sh DESCRIPTION
.Nm
reads JSON data from stdin.
@@ -19,32 +21,105 @@ The options are as follows:
Show the indices for array types (by default off).
.It Fl r
Show all control-characters (by default off).
+.It Fl F Ar fs
+Use
+.Ar fs
+as the field separator.
+The default is a TAB character.
+.It Fl R Ar rs
+Use
+.Ar rs
+as the record separator.
+The default is a newline character.
.El
-.Sh TAB-SEPARATED VALUE FORMAT
-The output format per line is:
+.Pp
+The
+.Ar fs
+or
+.Ar rs
+separators can be specified in the following formats:
+.Pp
+.Bl -item -compact
+.It
+\\\\ for a backslash character.
+.It
+\\n for a newline character.
+.It
+\\r for a carriage return character.
+.It
+\\t for a TAB character.
+.It
+\\xXX for a character specified in the hexadecimal format as XX.
+.El
+.Pp
+Otherwise: if a single character is specified this character will be used.
+If more than one character is specified it will be parsed as a number using the
+format supported by
+.Xr strtol 3
+with base set to 0; this number is the index of the character in the ASCII table.
+.Sh OUTPUT FORMAT
+The output format per node is:
.Bd -literal
-nodename<TAB>type<TAB>value<LF>
+nodename<FIELD SEPARATOR>type<FIELD SEPARATOR>value<RECORD SEPARATOR>
.Ed
.Pp
Control-characters such as a newline, TAB and backslash (\\n, \\t and \\\\) are
-always escaped in the nodename and value fields.
-Other control-characters are removed, unless the option
+escaped in the nodename and value fields unless a
+.Fl F
+or
+.Fl R
+option is specified.
+When the
+.Fl F No or Fl R
+option is specified then the separator characters are removed from the
+output.
+Control-characters are removed, unless the option
.Fl r
is set.
.Pp
The type field is a single byte and can be:
-.Bl -tag -width Ds
-.It a for array
-.It b for bool
-.It n for number
-.It o for object
-.It s for string
-.It ? for null
+.Pp
+.Bl -item -compact
+.It
+a for array
+.It
+b for bool
+.It
+n for number
+.It
+o for object
+.It
+s for string
+.It
+? for null
.El
.Sh EXIT STATUS
.Nm
exits with the exit status 0 on success, 1 on a parse error or 2 when
out of memory.
+.Sh EXAMPLES
+.Bd -literal
+json2tsv < input.json | awk -F '\\t' '$1 == ".url" { print $3 }'
+.Ed
+.Pp
+To filter without having to unescape characters the
+.Fl F
+and
+.Fl R
+options can be used.
+The example below uses the ASCII character 0x1f (Unit Separator) as the
+field separator and the ASCII character 0x1e (Record Separator) as the record
+separator.
+.Bd -literal
+json2tsv -r -F '\\x1f' -R '\\x1e' < input.json | \\
+ awk '
+ BEGIN {
+ FS = "\\x1f"; RS = "\\x1e";
+ }
+ $1 == ".url" {
+ print $3;
+ }'
+.Ed
.Sh SEE ALSO
.Xr awk 1 ,
.Xr grep 1
(DIR) diff --git a/json2tsv.c b/json2tsv.c
@@ -14,9 +14,12 @@
static int nflag = 0; /* -n flag: show indices count for arrays */
static int rflag = 0; /* -r flag: show all control-characters */
+static int fs = '\t', rs = '\n';
+
+static void (*printvalue)(const char *);
void
-printvalue(const char *s)
+tsv_printvalue(const char *s)
{
for (; *s; s++) {
/* escape some chars */
@@ -34,6 +37,18 @@ printvalue(const char *s)
}
void
+rs_printvalue(const char *s)
+{
+ for (; *s; s++) {
+ /* skip control chars (unless -r is set) and the separator chars */
+ if ((!rflag && iscntrl((unsigned char)*s)) ||
+ *s == fs || *s == rs)
+ continue;
+ putchar(*s);
+ }
+}
+
+void
processnode(struct json_node *nodes, size_t depth, const char *value)
{
size_t i;
@@ -58,17 +73,58 @@ processnode(struct json_node *nodes, size_t depth, const char *value)
}
}
- putchar('\t');
+ putchar(fs);
putchar(nodes[depth - 1].type);
- putchar('\t');
+ putchar(fs);
printvalue(value);
- putchar('\n');
+ putchar(rs);
+}
+
+int
+readnum(const char *s, int base)
+{
+ long l;
+ char *end;
+
+ errno = 0;
+ l = strtol(s, &end, base);
+ if (errno || s == end || *end != '\0' || l < 0 || l > 255) {
+ fprintf(stderr, "invalid number\n");
+ exit(1);
+ }
+
+ return (int)l;
+}
+
+int
+readchar(const char *s)
+{
+ if (!*s) {
+ fprintf(stderr, "invalid character\n");
+ exit(1);
+ } else if (strlen(s) == 1) {
+ return *s;
+ } else if (*s == '\\') {
+ s++;
+ switch (*s) {
+ case '\\': return '\\';
+ case 't': return '\t';
+ case 'n': return '\n';
+ case 'r': return '\r';
+ case 'x': return readnum(++s, 16); /* hexadecimal */
+ default:
+ fprintf(stderr, "unsupported escape character\n");
+ exit(1);
+ }
+ }
+ /* base 0 (decimal, octal, hex) using strtol() format */
+ return readnum(s, 0);
}
void
usage(const char *argv0)
{
- fprintf(stderr, "usage: %s [-n] [-r]\n", argv0);
+ fprintf(stderr, "usage: %s [-n] [-r] [-F fs] [-R rs]\n", argv0);
exit(1);
}
@@ -82,16 +138,34 @@ main(int argc, char *argv[])
return 1;
}
+ printvalue = tsv_printvalue;
for (i = 1; i < argc; i++) {
- if (argv[i][0] != '-' || argv[i][1] == '\0')
- usage(argv[0]);
- for (j = 1; argv[i][j]; j++) {
+ for (j = 1; i < argc && argv[i][j]; j++) {
switch (argv[i][j]) {
- case 'n': nflag = 1; break;
- case 'r': rflag = 1; break;
- default: usage(argv[0]); break;
+ case 'n':
+ nflag = 1;
+ break;
+ case 'r':
+ rflag = 1;
+ break;
+ case 'F':
+ if (i + 1 >= argc)
+ usage(argv[0]);
+ fs = readchar(argv[++i]);
+ printvalue = rs_printvalue;
+ goto nextarg;
+ case 'R':
+ if (i + 1 >= argc)
+ usage(argv[0]);
+ rs = readchar(argv[++i]);
+ printvalue = rs_printvalue;
+ goto nextarg;
+ default:
+ usage(argv[0]);
+ break;
}
}
+nextarg:;
}
switch (parsejson(processnode)) {