codemadness.org

       add -F and -R option to allow to specify a different field and record separator - json2tsv - JSON to TSV converter
 (HTM) git clone git://git.codemadness.org/json2tsv
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 18215ba6f1a9f2c76ffceded0670eb2b2f466792
 (DIR) parent 226d85203f7ea26dcc71c98e0fb7fe3ffb78176b
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 18 Sep 2021 16:17:32 +0200
       
       add -F and -R option to allow to specify a different field and record separator
       
       Diffstat:
         M README                              |       3 +++
         M json2tsv.1                          |     101 +++++++++++++++++++++++++++----
         M json2tsv.c                          |      96 +++++++++++++++++++++++++++----
       
       3 files changed, 176 insertions(+), 24 deletions(-)
       ---
 (DIR) diff --git a/README b/README
       @@ -112,6 +112,9 @@ $2 == "s" && index($3, "\\") {
                print $3;
        }'
        
       +To avoid having to unescape the data, a different field separator and record
       +separator can be set using the -F and -R options.
       +
        
        License
        -------
 (DIR) diff --git a/json2tsv.1 b/json2tsv.1
       @@ -1,4 +1,4 @@
       -.Dd November 5, 2019
       +.Dd September 24, 2021
        .Dt JSON2TSV 1
        .Os
        .Sh NAME
       @@ -8,6 +8,8 @@
        .Nm
        .Op Fl n
        .Op Fl r
       +.Op Fl F Ar fs
       +.Op Fl R Ar rs
        .Sh DESCRIPTION
        .Nm
        reads JSON data from stdin.
       @@ -19,32 +21,105 @@ The options are as follows:
        Show the indices for array types (by default off).
        .It Fl r
        Show all control-characters (by default off).
       +.It Fl F Ar fs
       +Use
       +.Ar fs
       +as the field separator.
       +The default is a TAB character.
       +.It Fl R Ar rs
       +Use
       +.Ar rs
       +as the record separator.
       +The default is a newline character.
        .El
       -.Sh TAB-SEPARATED VALUE FORMAT
       -The output format per line is:
       +.Pp
       +The
       +.Ar fs
       +or
       +.Ar rs
       +separators can be specified in the following formats:
       +.Pp
       +.Bl -item -compact
       +.It
       +\\\\ for a backslash character.
       +.It
       +\\n for a newline character.
       +.It
       +\\r for a carriage return character.
       +.It
       +\\t for a TAB character.
       +.It
       +\\xXX for a character specified in the hexadecimal format as XX.
       +.El
       +.Pp
       +Otherwise: if a single character is specified this character will be used.
       +If more than one character is specified it will be parsed as a number using the
       +format supported by
       +.Xr strtol 3
       +with base set to 0 and the resulting number (0 to 255) is used as the character code.
       +.Sh OUTPUT FORMAT
       +The output format per node is:
        .Bd -literal
       -nodename<TAB>type<TAB>value<LF>
       +nodename<FIELD SEPARATOR>type<FIELD SEPARATOR>value<RECORD SEPARATOR>
        .Ed
        .Pp
        Control-characters such as a newline, TAB and backslash (\\n, \\t and \\\\) are
       -always escaped in the nodename and value fields.
       -Other control-characters are removed, unless the option
       +escaped in the nodename and value fields unless a
       +.Fl F
       +or
       +.Fl R
       +option is specified.
       +When the
       +.Fl F
       +or
       +.Fl R
       +option is specified then the separator characters are removed from the output.
       +Control-characters are removed, unless the option
        .Fl r
        is set.
        .Pp
        The type field is a single byte and can be:
       -.Bl -tag -width Ds
       -.It a for array
       -.It b for bool
       -.It n for number
       -.It o for object
       -.It s for string
       -.It ? for null
       +.Pp
       +.Bl -item -compact
       +.It
       +a for array
       +.It
       +b for bool
       +.It
       +n for number
       +.It
       +o for object
       +.It
       +s for string
       +.It
       +? for null
        .El
        .Sh EXIT STATUS
        .Nm
        exits with the exit status 0 on success, 1 on a parse error or 2 when
        out of memory.
       +.Sh EXAMPLES
       +.Bd -literal
       +json2tsv < input.json | awk -F '\\t' '$1 == ".url" { print $3 }'
       +.Ed
       +.Pp
       +To filter without having to unescape characters the
       +.Fl F
       +and
       +.Fl R
       +options can be used.
       +The example below uses the ASCII character 0x1f (Unit Separator) as the
       +field separator and the ASCII character 0x1e (Record Separator) as the record
       +separator.
       +.Bd -literal
       +json2tsv -r -F '\\x1f' -R '\\x1e' < input.json | \\
       +        awk '
       +        BEGIN {
       +                FS = "\\x1f"; RS = "\\x1e";
       +        }
       +        $1 == ".url" {
       +                print $3;
       +        }'
       +.Ed
        .Sh SEE ALSO
        .Xr awk 1 ,
        .Xr grep 1
 (DIR) diff --git a/json2tsv.c b/json2tsv.c
       @@ -14,9 +14,12 @@
        
        static int nflag = 0; /* -n flag: show indices count for arrays */
        static int rflag = 0; /* -r flag: show all control-characters */
       +static int fs = '\t', rs = '\n';
       +
       +static void (*printvalue)(const char *);
        
        void
       -printvalue(const char *s)
       +tsv_printvalue(const char *s)
        {
                for (; *s; s++) {
                        /* escape some chars */
       @@ -34,6 +37,18 @@ printvalue(const char *s)
        }
        
        void
       +rs_printvalue(const char *s)
       +{
       +        for (; *s; s++) {
       +                /* ignore other control chars */
       +                if ((!rflag && iscntrl((unsigned char)*s)) ||
       +                    *s == fs || *s == rs)
       +                        continue;
       +                putchar(*s);
       +        }
       +}
       +
       +void
        processnode(struct json_node *nodes, size_t depth, const char *value)
        {
                size_t i;
       @@ -58,17 +73,58 @@ processnode(struct json_node *nodes, size_t depth, const char *value)
                        }
                }
        
       -        putchar('\t');
       +        putchar(fs);
                putchar(nodes[depth - 1].type);
       -        putchar('\t');
       +        putchar(fs);
                printvalue(value);
       -        putchar('\n');
       +        putchar(rs);
       +}
       +
       +int
       +readnum(const char *s, int base)
       +{
       +        long l;
       +        char *end;
       +
       +        errno = 0;
       +        l = strtol(s, &end, base);
       +        if (errno || s == end || *end != '\0' || l < 0 || l > 255) {
       +                fprintf(stderr, "invalid number\n");
       +                exit(1);
       +        }
       +
       +        return (int)l;
       +}
       +
       +int
       +readchar(const char *s)
       +{
       +        if (!*s) {
       +                fprintf(stderr, "invalid character\n");
       +                exit(1);
       +        } else if (strlen(s) == 1) {
       +                return *s;
       +        } else if (*s == '\\') {
       +                s++;
       +                switch (*s) {
       +                case '\\': return '\\';
       +                case 't': return '\t';
       +                case 'n': return '\n';
       +                case 'r': return '\r';
       +                case 'x': return readnum(++s, 16); /* hexadecimal */
       +                default:
       +                        fprintf(stderr, "unsupported escape character\n");
       +                        exit(1);
       +                }
       +        }
       +        /* base 0 (decimal, octal, hex) using strtol() format */
       +        return readnum(s, 0);
        }
        
        void
        usage(const char *argv0)
        {
       -        fprintf(stderr, "usage: %s [-n] [-r]\n", argv0);
       +        fprintf(stderr, "usage: %s [-n] [-r] [-F fs] [-R rs]\n", argv0);
                exit(1);
        }
        
       @@ -82,16 +138,34 @@ main(int argc, char *argv[])
                        return 1;
                }
        
       +        printvalue = tsv_printvalue;
                for (i = 1; i < argc; i++) {
       -                if (argv[i][0] != '-' || argv[i][1] == '\0')
       -                        usage(argv[0]);
       -                for (j = 1; argv[i][j]; j++) {
       +                for (j = 1; i < argc && argv[i][j]; j++) {
                                switch (argv[i][j]) {
       -                        case 'n': nflag = 1; break;
       -                        case 'r': rflag = 1; break;
       -                        default: usage(argv[0]); break;
       +                        case 'n':
       +                                nflag = 1;
       +                                break;
       +                        case 'r':
       +                                rflag = 1;
       +                                break;
       +                        case 'F':
       +                                if (i + 1 >= argc)
       +                                        usage(argv[0]);
       +                                fs = readchar(argv[++i]);
       +                                printvalue = rs_printvalue;
       +                                goto nextarg;
       +                        case 'R':
       +                                if (i + 1 >= argc)
       +                                        usage(argv[0]);
       +                                rs = readchar(argv[++i]);
       +                                printvalue = rs_printvalue;
       +                                goto nextarg;
       +                        default:
       +                                usage(argv[0]);
       +                                break;
                                }
                        }
       +nextarg:;
                }
        
                switch (parsejson(processnode)) {