add support for high-low surrogates and UTF-16 decoding - json2tsv - JSON to TSV converter
(HTM) git clone git://git.codemadness.org/json2tsv
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 933582372d81a911193fb1da7c86b6b960432535
(DIR) parent 922491e0343ab6f440024803921daf843b0e9cf5
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 13 Oct 2019 21:31:31 +0200
add support for high-low surrogates and UTF-16 decoding
Seen in the wild in a Reddit JSON file that encodes emojis this way.
It is also described in RFC 7159, section 7 (Strings).
Diffstat:
M json2tsv.c | 25 ++++++++++++++++++++++++-
1 file changed, 24 insertions(+), 1 deletion(-)
---
(DIR) diff --git a/json2tsv.c b/json2tsv.c
@@ -107,7 +107,7 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
{
struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
size_t depth = 0, v = 0, vz = 0;
- long cp;
+ long cp, hi, lo;
int c, i, escape, ret = -1;
char *value = NULL;
@@ -164,6 +164,29 @@ parsejson(void (*cb)(struct json_node *, size_t, const char *), const char **err
}
cp |= (hexdigit(c) << i);
}
+ /* See also:
+ * RFC7159 - 7. Strings and
+ * https://unicode.org/faq/utf_bom.html#utf8-4
+ * 0xd800 - 0xdb7f - high surrogates (no private use range) */
+ if (cp >= 0xd800 && cp <= 0xdb7f) {
+ if (GETNEXT() != '\\' || GETNEXT() != 'u') {
+ *errstr = "invalid codepoint";
+ goto end;
+ }
+ for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
+ if ((c = GETNEXT()) == EOF || !isxdigit(c)) {
+ *errstr = "invalid codepoint";
+ goto end;
+ }
+ lo |= (hexdigit(c) << i);
+ }
+ /* 0xdc00 - 0xdfff - low surrogates: must follow after high surrogate */
+ if (!(lo >= 0xdc00 && lo <= 0xdfff)) {
+ *errstr = "invalid codepoint";
+ goto end;
+ }
+ cp = (hi << 10) + (0xDC00 + (lo & 0x3FF)) - 56613888;
+ }
if (capacity(&value, &vz, v, 5) == -1)
goto end;
v += codepointtoutf8(cp, &value[v]);