utf8.[ch]: imported utf8len() and utf8check() - iomenu - interactive terminal-based selection menu
(HTM) git clone git://bitreich.org/iomenu git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/iomenu
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) README
(DIR) LICENSE
---
(DIR) commit c383bef4af98a82331c2c6e56ff7964e3dbf690b
(DIR) parent 09d7cdbc37907c01400e2193f4eafba74736aa7d
(HTM) Author: Josuah Demangeon <josuah.demangeon@gandi.net>
Date: Tue, 22 Aug 2017 19:48:53 +0200
utf8.[ch]: imported utf8len() and utf8check()
Diffstat:
M iomenu.c | 37 ++++++++++++++++---------------
D utf.c | 332 -------------------------------
D utf.h | 18 ------------------
A utf8.c | 142 +++++++++++++++++++++++++++++++
A utf8.h | 5 +++++
5 files changed, 166 insertions(+), 368 deletions(-)
---
(DIR) diff --git a/iomenu.c b/iomenu.c
@@ -12,9 +12,9 @@
#define CONTINUE 2 /* as opposed to EXIT_SUCCESS and EXIT_FAILURE */
-#define CONTROL(char) (char ^ 0x40)
+#define CTL(char) (char ^ 0x40)
#define ALT(char) (char + 0x80)
-#define ESC(char) (char + 0x80 + 0x80)
+#define CSI(char) (char + 0x80 + 0x80)
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
static struct winsize ws;
@@ -74,7 +74,7 @@ resetterminal(void)
int i;
/* clear terminal */
- for (i = 0; i < opt['l'] + 1; i++)
+ for (i = 0; i < rows + 1; i++)
fputs("\r\033[K\n", stderr);
/* reset cursor position */
@@ -322,61 +322,61 @@ key(void)
top:
switch (key) {
- case CONTROL('C'):
+ case CTL('C'):
return EXIT_FAILURE;
- case CONTROL('U'):
+ case CTL('U'):
input[0] = '\0';
filter();
break;
- case CONTROL('W'):
+ case CTL('W'):
removeword();
break;
case 127:
- case CONTROL('H'): /* backspace */
+ case CTL('H'): /* backspace */
input[strlen(input) - 1] = '\0';
filter();
break;
- case ESC('A'): /* up */
- case CONTROL('P'):
+ case CSI('A'): /* up */
+ case CTL('P'):
move(-1);
break;
- case ESC('B'): /* down */
- case CONTROL('N'):
+ case CSI('B'): /* down */
+ case CTL('N'):
move(+1);
break;
- case ESC('5'):
+ case CSI('5'): /* page up */
if (fgetc(stdin) != '~') break;
/* FALLTHROUGH */
case ALT('v'):
movepg(-1);
break;
- case ESC('6'):
+ case CSI('6'): /* page down */
if (fgetc(stdin) != '~') break;
/* FALLTHROUGH */
- case CONTROL('V'):
+ case CTL('V'):
movepg(+1);
break;
- case CONTROL('I'): /* tab */
+ case CTL('I'): /* tab */
if (linec > 0)
strcpy(input, matchv[current]);
filter();
break;
- case CONTROL('J'): /* enter */
- case CONTROL('M'):
+ case CTL('J'): /* enter */
+ case CTL('M'):
printselection();
return EXIT_SUCCESS;
case ALT('['):
- key = ESC(fgetc(stdin));
+ key = CSI(fgetc(stdin));
goto top;
case 033: /* escape / alt */
@@ -464,6 +464,7 @@ main(int argc, char *argv[])
input[0] = '\0';
while ((exitcode = key()) == CONTINUE)
printscreen();
+ printscreen();
resetterminal();
close(ttyfd);
(DIR) diff --git a/utf.c b/utf.c
@@ -1,332 +0,0 @@
-/*
- * Functions handling UTF-8 strings:
- *
- * stdin -> buffer -> stdout
- * UTF-8 -> rune -> UTF-8
- * char[] -> long[] -> char[]
- *
- * Thanks to Connor Lane Smith for the idea of combining switches and
- * binary masks.
- */
-
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "utf.h"
-
-
-/* --- lengths -------------------------------------------------------------- */
-
-
-/*
- * Return the number of bytes in rune for the `n` next char in `s`,
- * or 0 if ti is misencoded.
- */
-int
-utflen(char *s, int n)
-{
- int len = 1;
- int continuation_bytes =
- (s[0] & 0x80) == 0x00 ? 0 : /* 0xxxxxxx */
- (s[0] & 0xc0) == 0x80 ? 1 : /* 10xxxxxx */
- (s[0] & 0xe0) == 0xc0 ? 2 : /* 110xxxxx */
- (s[0] & 0xf0) == 0xe0 ? 3 : /* 1110xxxx */
- (s[0] & 0xf8) == 0xf0 ? 4 : /* 11110xxx */
- (s[0] & 0xfc) == 0xf8 ? 5 : /* 111110xx */
- (s[0] & 0xfe) == 0xfc ? 6 : /* 1111110x */
- (s[0] & 0xff) == 0xfe ? 7 : /* 11111110 */
- 8; /* 11111111 */
-
- if (continuation_bytes > 6 || continuation_bytes > n)
- return 0;
-
- /* check if continuation bytes are 10xxxxxx and increment `len` */
- switch (continuation_bytes) { /* FALLTHROUGH */
- case 7: if ((s[6] & 0xc0) != 0x80) return 0; else len++;
- case 6: if ((s[5] & 0xc0) != 0x80) return 0; else len++;
- case 5: if ((s[4] & 0xc0) != 0x80) return 0; else len++;
- case 4: if ((s[3] & 0xc0) != 0x80) return 0; else len++;
- case 3: if ((s[2] & 0xc0) != 0x80) return 0; else len++;
- case 2: if ((s[1] & 0xc0) != 0x80) return 0; else len++;
- case 0: return len;
- default: return 0;
- }
-}
-
-
-/*
- * Return the number of bytes required to encode `rune` into UTF-8, or
- * 0 if rune is too long.
- */
-int
-runelen(long r)
-{
- if (r <= 0x0000007f) return 1;
- if (r <= 0x000007ff) return 2;
- if (r <= 0x0000ffff) return 3;
- if (r <= 0x001fffff) return 4;
- if (r <= 0x03ffffff) return 5;
- if (r <= 0x7fffffff) return 6;
- return 0;
-}
-
-
-/* --- conversions ---------------------------------------------------------- */
-
-
-/*
- * Sets `r` to a rune corresponding to the firsts `n` bytes of `s`.
- * If `s` is misencoded, the rune is stored as a negative value.
- *
- * Return the number of bytes read.
- */
-int
-utftorune(long *r, char *s, int n)
-{
- int len = utflen(s, n), i;
-
- /* first byte */
- switch (len) {
- case 1: *r = s[0]; return 1; /* 0xxxxxxx */
- case 2: *r = s[0] & 0x1f; break; /* 110xxxxx */
- case 3: *r = s[0] & 0x0f; break; /* 1110xxxx */
- case 4: *r = s[0] & 0x07; break; /* 11110xxx */
- case 5: *r = s[0] & 0x03; break; /* 111110xx */
- case 6: *r = s[0] & 0x01; break; /* 1111110x */
- default: *r = -(unsigned char) s[0]; return 1; /* misencoded */
- }
-
- /* continuation bytes */
- for (i = 1; i < len; i++)
- *r = (*r << 6) | (s[i] & 0x3f); /* 10xxxxxx */
-
- /* overlong sequences */
- if (runelen(*r) != len) {
- *r = -(unsigned char) s[0];
- return 1;
- }
-
- return len;
-}
-
-
-/*
- * Convert the utf char sring `src` of size `n` to a long string
- * `dest`.
- *
- * Return the length of `i`.
- */
-size_t
-utftorunes(long *runes, char *utf, size_t n)
-{
- size_t i, j;
-
- for (i = 0, j = 0; n > 0; i++)
- j += utftorune(runes + i, utf + j, n - j);
-
- runes[i] = '\0';
- return i;
-}
-
-
-/*
- * Encode the rune `r` in utf-8 in `s`, null-terminated.
- *
- * Return the number of bytes written, 0 if `r` is invalid.
- */
-int
-runetoutf(char *s, long r)
-{
- switch (runelen(r)) {
- case 1:
- s[0] = r; /* 0xxxxxxx */
- s[1] = '\0';
- return 1;
- case 2:
- s[0] = 0xc0 | (0x1f & (r >> 6)); /* 110xxxxx */
- s[1] = 0x80 | (0x3f & (r)); /* 10xxxxxx */
- s[2] = '\0';
- return 2;
- case 3:
- s[0] = 0xe0 | (0x0f & (r >> 12)); /* 1110xxxx */
- s[1] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */
- s[2] = 0x80 | (0x3f & (r)); /* 10xxxxxx */
- s[3] = '\0';
- return 3;
- case 4:
- s[0] = 0xf0 | (0x07 & (r >> 18)); /* 11110xxx */
- s[1] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */
- s[2] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */
- s[3] = 0x80 | (0x3f & (r)); /* 10xxxxxx */
- s[4] = '\0';
- return 4;
- case 5:
- s[0] = 0xf8 | (0x03 & (r >> 24)); /* 111110xx */
- s[1] = 0x80 | (0x3f & (r >> 18)); /* 10xxxxxx */
- s[2] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */
- s[3] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */
- s[4] = 0x80 | (0x3f & (r)); /* 10xxxxxx */
- s[5] = '\0';
- return 5;
- case 6:
- s[0] = 0xfc | (0x01 & (r >> 30)); /* 1111110x */
- s[1] = 0x80 | (0x3f & (r >> 24)); /* 10xxxxxx */
- s[2] = 0x80 | (0x3f & (r >> 18)); /* 10xxxxxx */
- s[3] = 0x80 | (0x3f & (r >> 12)); /* 10xxxxxx */
- s[4] = 0x80 | (0x3f & (r >> 6)); /* 10xxxxxx */
- s[5] = 0x80 | (0x3f & (r)); /* 10xxxxxx */
- s[6] = '\0';
- return 6;
- default:
- s[0] = '\0';
- return 0;
- }
-}
-
-
-/*
- * Fill `s` with a printable representation of `r`.
- *
- * Return the width of the character.
- */
-int
-runetoprint(char *s, long r)
-{
- if (r < 0) {
- return sprintf(s, "[%02x]", (unsigned char) -r);
-
- } else if (r == 0x7f || r < ' ') {
- return sprintf(s, "[%02lx]", r);
-
- } else if (!runeisprint(r)) {
- return sprintf(s, "[%04lx]", r);
-
- } else {
- runetoutf(s, r);
- return 1;
- }
-
- return 0;
-}
-
-
-/* --- standard library ----------------------------------------------------- */
-
-
-/*
- * Returns 1 if the rune is a printable character and 0 if not.
- */
-int
-runeisprint(long r)
-{
- return !(
- (r != '\t' && r < ' ') || /* ascii control */
- (r == 0x7f) ||
-
- (0x80 <= r && r < 0xa0) || /* unicode control */
-
- (r > 0x10ffff) || /* outside range */
-
- ((r & 0x00fffe) == 0x00fffe) || /* noncharacters */
- (0x00fdd0 <= r && r <= 0x00fdef) ||
-
- (0x00e000 <= r && r <= 0x00f8ff) || /* private use */
- (0x0f0000 <= r && r <= 0x0ffffd) ||
- (0x100000 <= r && r <= 0x10fffd) ||
-
- (0x00d800 <= r && r <= 0x00dfff) /* surrogates */
- );
-}
-
-
-/*
- * Read an utf string from `f` up to the first '\n' character or the
- * end of the file. It is stored as a rune array into the newly
- * allocated `r`.
- *
- * Return the length of `r`, or -1 if malloc fails or if the end of
- * `f` is reached.
- */
-size_t
-getrunes(long **r, FILE *f)
-{
- size_t slen, rlen = 0, size = BUFSIZ, i;
- int c;
- char *s;
-
- if (!(s = malloc(size))) return -1;
- for (slen = 0; (c = fgetc(f)) != EOF && (c != '\n'); slen++) {
- if (slen > size && !(s = realloc(s, ++size))) return -1;
- s[slen] = c;
- }
-
- if (!(*r = malloc(size * sizeof (long)))) return -1;
- for (i = 0; i < slen; rlen++)
- i += utftorune(*r + rlen, s + i, slen - i);
- (*r)[rlen] = '\0';
-
- free(s);
- if (feof(f)) return -1; else return rlen;
-}
-
-
-long *
-runescpy(long *dest, long *src)
-{
- size_t i;
-
- for (i = 0; src[i] != '\0'; i++)
- dest[i] = src[i];
- dest[i] = '\0';
-
- return dest;
-}
-
-
-long *
-runeschr(long *s, long r)
-{
- size_t i;
-
- for (i = 0; s[i] != '\0'; i++)
- if (s[i] == r) return s + i;
-
- return NULL;
-}
-
-
-long *
-runescat(long *s1, long *s2)
-{
- size_t i, j;
-
- for (i = 0; s1[i] != '\0'; i++);
- for (j = 0; s2[j] != '\0'; j++)
- s1[i + j] = s2[j];
- s1[i + j] = '\0';
-
- return s1;
-}
-
-
-int
-main()
-{
- char s[BUFSIZ];
- long *r;
- int len, i;
-
- for (len = 0; (len = getrunes(&r, stdin)) >= 0 && !feof(stdin); free(r)) {
- for (i = 0; i < len; i++) {
- runetoprint(s, r[i]);
- fputs(s, stdout);
- }
-
- putchar('\n');
- }
- free(r);
-
- return 0;
-}
(DIR) diff --git a/utf.h b/utf.h
@@ -1,18 +0,0 @@
-/* lengths */
-int utflen(char *, int);
-int runelen(long);
-
-/* conversions */
-int utftorune(long *, char *, int);
-int utftorune(long *, char *, int);
-int runetoutf(char *, long);
-int runetoprint(char *, long);
-
-
-/* standard library */
-
-int runeisprint(long);
-size_t getrunes(long **, FILE *);
-long * runescpy(long *, long *);
-long * runeschr(long *, long);
-long * runescat(long *, long *);
(DIR) diff --git a/utf8.c b/utf8.c
@@ -0,0 +1,142 @@
+/*
+ * ASCII characters all have a leading '0' bit:
+ *
+ * 0xxxxxxx
+ *
+ * UTF-8(7) lead bytes have one leading '1' bit and as many following '1' bits
+ * as there are continuation bytes (which start with the bits '1' and '0').
+ *
+ * 0xxxxxxx
+ * 110xxxxx 10xxxxxx
+ * 1110xxxx 10xxxxxx 10xxxxxx
+ * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * Unicode itself needs at most 3 continuation bytes -- up to 4 bytes per rune.
+ *
+ * The whole character value is retrieved from the 'x' bits and stored into a
+ * (long)[].
+ *
+ * Thanks to Connor Lane Smith for the idea of combining switches and
+ * binary masks.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "utf8.h"
+
+
+/*
+ * Return the number of bytes in the rune at the start of the `n` next chars of
+ * `s`, or 0 if it is misencoded or longer than `n`.
+ */
+size_t
+utf8len(char *s, int n)
+{
+ unsigned char *sp = (unsigned char *) s;
+ int i, len = (*sp < 0x80) ? 1 : /* 0xxxxxxx < 10000000 */
+ (*sp < 0xc0) ? 0 : /* 10xxxxxx < 11000000 */
+ (*sp < 0xe0) ? 2 : /* 110xxxxx < 11100000 */
+ (*sp < 0xf0) ? 3 : /* 1110xxxx < 11110000 */
+ (*sp < 0xf8) ? 4 : /* 11110xxx < 11111000 */
+ (*sp < 0xfc) ? 5 : /* 111110xx < 11111100 */
+ (*sp < 0xfe) ? 6 : /* 1111110x < 11111110 */
+ (*sp < 0xff) ? 7 : /* 11111110 < 11111111 */
+ 0;
+ if (len > n) return 0;
+
+ /* check continuation bytes */
+ for (sp++, i = 1; i < len; i++, sp++)
+ if ((*sp & 0xc0) != 0x80) /* 10xxxxxx & 11000000 */
+ return 0;
+
+ return len;
+}
+
+
+/*
+ * Return the number of bytes required to encode the rune `r` into UTF-8, or
+ * 0 if the rune is too large.
+ */
+size_t
+utf8runelen(long r)
+{
+ return (r <= 0x0000007f) ? 1 : (r <= 0x000007ff) ? 2 :
+ (r <= 0x0000ffff) ? 3 : (r <= 0x001fffff) ? 4 :
+ (r <= 0x03ffffff) ? 5 : (r <= 0x7fffffff) ? 6 : 0;
+}
+
+
+/*
+ * Set 'r' to the rune encoded at the start of the first 'n' bytes of 's'.
+ *
+ * Return the number of bytes read or 0 if the string is misencoded.
+ */
+size_t
+utf8torune(long *r, char *s, size_t n)
+{
+ char mask[] = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+ size_t i, len = utf8len(s, n);
+
+ if (len == 0 || len > 6 || len > n)
+ return 0;
+
+ /* first byte */
+ *r = *s++ & mask[len - 1];
+
+ /* continuation bytes */
+ for (i = 1; i < len; i++)
+ *r = (*r << 6) | (*s++ & 0x3f); /* 10xxxxxx */
+
+ /* overlong sequences */
+ if (utf8runelen(*r) != len)
+ return 0;
+
+ return len;
+}
+
+
+/*
+ * Returns 1 if the rune is a valid unicode code point and 0 if not.
+ */
+int
+utf8runeisunicode(long r)
+{
+ return !(
+ (r > 0x10ffff) || /* outside range */
+
+ ((r & 0x00fffe) == 0x00fffe) || /* noncharacters */
+ (0x00fdd0 <= r && r <= 0x00fdef) ||
+
+ (0x00e000 <= r && r <= 0x00f8ff) || /* private use */
+ (0x0f0000 <= r && r <= 0x0ffffd) ||
+ (0x100000 <= r && r <= 0x10fffd) ||
+
+ (0x00d800 <= r && r <= 0x00dfff) /* surrogates */
+ );
+}
+
+
+/*
+ * Return 1 if the 'len' bytes at 's' are correctly encoded in UTF-8 with
+ * allowed Unicode code points, and 0 otherwise.
+ */
+int
+utf8check(char *s, size_t len)
+{
+ size_t shift;
+ long r = 0;
+
+ while (len > 0) {
+ shift = utf8torune(&r, s, len);
+ if (!shift || !utf8runeisunicode(r))
+ return 0;
+
+ s += shift;
+ len -= shift;
+ }
+
+ return 1;
+}
(DIR) diff --git a/utf8.h b/utf8.h
@@ -0,0 +1,5 @@
+size_t utf8len(char *, int);
+size_t utf8runelen(long);
+size_t utf8torune(long *, char *, size_t);
+int utf8runeisunicode(long);
+int utf8check(char *, size_t);