handle unescaped string and unexpected EOF and improve handling surrogates - json2tsv - JSON to TSV converter
(HTM) git clone git://git.codemadness.org/json2tsv
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 9f4ab639718e4351b02b4bde6035cd588c32b169
(DIR) parent f0b7f8935d41162e29c5a01f15273ba225909969
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Tue, 15 Oct 2019 18:58:56 +0200
handle unescaped string and unexpected EOF and improve handling surrogates
.... handle UTF-16 surrogate code-point errors and try to recover gracefully, just
output the raw bytes.
Diffstat:
M json2tsv.c | 45 ++++++++++++++++++++-----------
1 file changed, 29 insertions(+), 16 deletions(-)
---
(DIR) diff --git a/json2tsv.c b/json2tsv.c
@@ -126,7 +126,7 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
nodes[depth].type = TYPE_PRIMITIVE;
while ((c = GETNEXT()) != EOF) {
- /* not whitespace or control-character */
+ /* not whitespace or control character */
if (c <= 0x20 || c == 0x7f)
continue;
@@ -149,14 +149,19 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
break;
case '"':
nodes[depth].type = TYPE_STRING;
- for (escape = 0; (c = GETNEXT()) != EOF;) {
- /* 0x7f is not defined as a control-character in strings in the RFC */
- if (c < 0x20)
- continue;
+ escape = 0;
+ for (;;) {
+ c = GETNEXT();
+chr:
+ if (c < 0x20) {
+ /* EOF or control char: 0x7f is not defined as a control char in RFC8259 */
+ *errstr = JSON_ERROR_INVALID_CHAR;
+ goto end;
+ }
if (escape) {
+escchr:
escape = 0;
-
switch (c) {
case '"': /* FALLTHROUGH */
case '\\':
@@ -167,6 +172,8 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
case 'r': c = '\r'; break;
case 't': c = '\t'; break;
case 'u': /* hex hex hex hex */
+ if (capacity(&value, &vz, v, 4) == -1)
+ goto end;
for (i = 12, cp = 0; i >= 0; i -= 4) {
if ((c = GETNEXT()) == EOF || !isxdigit(c)) {
*errstr = JSON_ERROR_CODEPOINT;
@@ -175,13 +182,17 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
cp |= (hexdigit(c) << i);
}
/* See also:
- * RFC7159 - 7. Strings and
+ * RFC8259 - 7. Strings and
* https://unicode.org/faq/utf_bom.html#utf8-4
* 0xd800 - 0xdb7f - high surrogates (no private use range) */
if (cp >= 0xd800 && cp <= 0xdb7f) {
- if (GETNEXT() != '\\' || GETNEXT() != 'u') {
- *errstr = JSON_ERROR_CODEPOINT;
- goto end;
+ if ((c = GETNEXT()) != '\\') {
+ v += codepointtoutf8(cp, &value[v]);
+ goto chr;
+ }
+ if ((c = GETNEXT()) != 'u') {
+ v += codepointtoutf8(cp, &value[v]);
+ goto escchr;
}
for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
if ((c = GETNEXT()) == EOF || !isxdigit(c)) {
@@ -191,14 +202,16 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
lo |= (hexdigit(c) << i);
}
/* 0xdc00 - 0xdfff - low surrogates: must follow after high surrogate */
- if (!(lo >= 0xdc00 && lo <= 0xdfff)) {
- *errstr = JSON_ERROR_CODEPOINT;
- goto end;
+ if (lo >= 0xdc00 && lo <= 0xdfff) {
+ cp = (hi << 10) + lo - 56613888; /* - offset */
+ } else {
+ v += codepointtoutf8(hi, &value[v]);
+ if (capacity(&value, &vz, v, 4) == -1)
+ goto end;
+ v += codepointtoutf8(lo, &value[v]);
+ continue;
}
- cp = (hi << 10) + lo - 56613888; /* - offset */
}
- if (capacity(&value, &vz, v, 4) == -1)
- goto end;
v += codepointtoutf8(cp, &value[v]);
continue;
default: