utf8expr.c - utf8expr - expr(1) for UTF-8
 (HTM) git clone git://bitreich.org/utf8expr/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/utf8expr/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) LICENSE
       ---
       utf8expr.c (2190B)
       ---
            1 /*
            2  * Copy me if you can.
            3  * by 20h
            4  */
            5 
            6 #include <unistd.h>
            7 #include <string.h>
            8 #include <stdlib.h>
            9 #include <stdio.h>
           10 #include <libgen.h>
           11 
           12 #include "arg.h"
           13 
           14 char *argv0;
           15 
           /*
            * Count the characters of the UTF-8 string s.
            *
            * Every byte that is not a continuation byte (bit pattern 10xxxxxx)
            * starts a new character, so counting those bytes yields the number of
            * characters.  The terminating NUL is not counted.
            *
            * Idea taken from:
            *        http://canonical.org/~kragen/strlen-utf8.html
            */
           size_t
           utf8strlen(char *s)
           {
                   size_t count;
                   char *p;

                   count = 0;
                   p = s;
                   while (*p != '\0') {
                           /* The comparison evaluates to 1 for a leading byte. */
                           count += ((*p & 0xc0) != 0x80);
                           p++;
                   }

                   return count;
           }
           33 
           /*
            * Find the UTF-8 character c in the string s.
            *
            * s is walked byte by byte.  Each time a character boundary is reached
            * (a byte that is not a continuation byte, or the terminating NUL), the
            * j bytes of the character that just ended are compared against c.
            * c is expected to hold exactly one character; it is always compared
            * as a whole against one character of s.
            *
            * Returns a pointer to the first matching character inside s, or NULL
            * if c is empty, if nothing matches, or if a sequence longer than
            * seven bytes is met before a match.
            */
           char *
           utf8strchr(char *s, char *c)
           {
                   size_t j, cl;
                   char *start;

                   cl = strlen(c);
                   if (cl == 0)
                           return NULL;

                   start = s;
                   for (j = 0; ; s++) {
                           if (j > 6)
                                   return NULL;
                           j++;

                           if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
                                   /*
                                    * On the very first boundary no character has been
                                    * completed yet; only compare when j bytes really
                                    * lie between the start of the string and s, so
                                    * that nothing before the string is ever read.
                                    */
                                   if (cl == j && (size_t)(s - start) >= j) {
                                           if (!memcmp(s - j, c, cl))
                                                   return s - j;
                                   }
                                   j = 0;

                                   if (s[0] == '\0')
                                           break;
                           }
                   }

                   return NULL;
           }
           62 
           /*
            * Locate a substring of the UTF-8 string s.
            *
            * pos is the 1-based position, counted in characters, of the first
            * character wanted.  On entry *length is the maximum number of
            * characters wanted; on successful return it holds the number of
            * BYTES the substring occupies.  The result is not NUL-terminated at
            * that point, so callers have to honour *length.
            *
            * Returns a pointer into s, or NULL if *length or pos is zero, if pos
            * lies behind the end of s, or if a sequence longer than seven bytes
            * is met.
            */
           char *
           utf8substr(char *s, size_t pos, size_t *length)
           {
                   size_t i, j, rl;
                   char *ret;

                   if (*length < 1)
                           return NULL;
                   /*
                    * Positions start at 1.  Without this check position 0 would
                    * select the (non-existent) character in front of the string
                    * and hand out a pointer to s[-1].
                    */
                   if (pos < 1) {
                           *length = 0;
                           return NULL;
                   }

                   ret = NULL;
                   rl = 0;
                   for (i = 0, j = 0; *length > 0; s++) {
                           if (j > 6)
                                   return NULL;
                           j++;

                           /* Once the start is known every further byte belongs to it. */
                           if (ret != NULL)
                                   rl++;

                           if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
                                   /*
                                    * A boundary completes character number i, which is
                                    * j bytes long and ends right before s.
                                    */
                                   if (i >= pos) {
                                           if (ret == NULL) {
                                                   ret = s - j;
                                                   rl = j;
                                           }
                                           (*length)--;
                                   }
                                   i++;
                                   j = 0;

                                   if (s[0] == '\0')
                                           break;
                           }
                   }

                   *length = rl;
                   return ret;
           }
          101 
          /*
           * Find the first character of s that also occurs in chars.
           *
           * s is walked byte by byte; at each character boundary the character
           * that just ended is copied into a NUL-terminated scratch buffer and
           * looked up in chars with utf8strchr().
           *
           * Returns the 1-based position of that character, counted in
           * characters, or 0 if no character of s occurs in chars or if a
           * sequence longer than seven bytes is met.
           */
          size_t
          utf8index(char *s, char *chars)
          {
                  size_t i, j;
                  /* Up to seven bytes per character plus the terminating NUL. */
                  char c[8];
                  char *start;

                  start = s;
                  j = 0;
                  for (i = 0; ; s++) {
                          if (j > 6)
                                  return 0;
                          j++;

                          if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') {
                                  /*
                                   * The first boundary completes no character; skip
                                   * the lookup there instead of reading the bytes in
                                   * front of the string.  i is still advanced, which
                                   * is what makes the result 1-based.
                                   */
                                  if ((size_t)(s - start) >= j) {
                                          memset(c, 0, sizeof(c));
                                          memcpy(c, s - j, j);
                                          if (utf8strchr(chars, c))
                                                  return i;
                                  }
                                  i++;
                                  j = 0;

                                  if (s[0] == '\0')
                                          break;
                          }
                  }

                  return 0;
          }
          129 
          130 void
          131 usage(void)
          132 {
          133         fprintf(stderr, "usage: %s [substr|index|length] str [args ...]\n",
          134                         basename(argv0));
          135         exit(1);
          136 }
          137 
          138 int
          139 main(int argc, char *argv[])
          140 {
          141         char *s;
          142         size_t len;
          143 
          144         argv0 = argv[0];
          145 
          146         if (argc < 3)
          147                 usage();
          148 
          149         switch(argv[1][0]) {
          150         case 'i':
          151                 if (argc < 4)
          152                         usage();
          153                 printf("%ld\n", utf8index(argv[2], argv[3]));
          154                 break;
          155         case 'l':
          156                 printf("%ld\n", utf8strlen(argv[2]));
          157                 break;
          158         case 's':
          159                 if (argc < 5)
          160                         usage();
          161                 len = atoi(argv[4]);
          162                 s = utf8substr(argv[2], atoi(argv[3]), &len);
          163                 if (s == NULL)
          164                         return -1;
          165                 printf("%.*s\n", (int)len, s);
          166                 break;
          167         default:
          168                 usage();
          169         };
          170 
          171         return 0;
          172 }
          173