rune.c - 9base - revived minimalist port of Plan 9 userland to Unix
 (HTM) git clone git://git.suckless.org/9base
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       rune.c (3963B)
       ---
            1 /*
            2  * The authors of this software are Rob Pike and Ken Thompson.
            3  *              Copyright (c) 2002 by Lucent Technologies.
            4  * Permission to use, copy, modify, and distribute this software for any
            5  * purpose without fee is hereby granted, provided that this entire notice
            6  * is included in all copies of any software which is or includes a copy
            7  * or modification of this software and in all copies of the supporting
            8  * documentation for such software.
            9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
           10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
           11  * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
           12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
           13  */
           14 #include <stdarg.h>
           15 #include <string.h>
           16 #include "plan9.h"
           17 #include "utf.h"
           18 
           19 enum
           20 {
           21         Bit1        = 7,
           22         Bitx        = 6,
           23         Bit2        = 5,
           24         Bit3        = 4,
           25         Bit4        = 3,
           26         Bit5        = 2,
           27 
           28         T1        = ((1<<(Bit1+1))-1) ^ 0xFF,        /* 0000 0000 */
           29         Tx        = ((1<<(Bitx+1))-1) ^ 0xFF,        /* 1000 0000 */
           30         T2        = ((1<<(Bit2+1))-1) ^ 0xFF,        /* 1100 0000 */
           31         T3        = ((1<<(Bit3+1))-1) ^ 0xFF,        /* 1110 0000 */
           32         T4        = ((1<<(Bit4+1))-1) ^ 0xFF,        /* 1111 0000 */
           33         T5        = ((1<<(Bit5+1))-1) ^ 0xFF,        /* 1111 1000 */
           34 
           35         Rune1        = (1<<(Bit1+0*Bitx))-1,                /* 0000 0000 0000 0000 0111 1111 */
           36         Rune2        = (1<<(Bit2+1*Bitx))-1,                /* 0000 0000 0000 0111 1111 1111 */
           37         Rune3        = (1<<(Bit3+2*Bitx))-1,                /* 0000 0000 1111 1111 1111 1111 */
           38         Rune4        = (1<<(Bit4+3*Bitx))-1,                /* 0011 1111 1111 1111 1111 1111 */
           39 
           40         Maskx        = (1<<Bitx)-1,                        /* 0011 1111 */
           41         Testx        = Maskx ^ 0xFF,                        /* 1100 0000 */
           42 
           43         Bad        = Runeerror
           44 };
           45 
           46 int
           47 chartorune(Rune *rune, char *str)
           48 {
           49         int c, c1, c2, c3;
           50         long l;
           51 
           52         /*
           53          * one character sequence
           54          *        00000-0007F => T1
           55          */
           56         c = *(uchar*)str;
           57         if(c < Tx) {
           58                 *rune = c;
           59                 return 1;
           60         }
           61 
           62         /*
           63          * two character sequence
           64          *        0080-07FF => T2 Tx
           65          */
           66         c1 = *(uchar*)(str+1) ^ Tx;
           67         if(c1 & Testx)
           68                 goto bad;
           69         if(c < T3) {
           70                 if(c < T2)
           71                         goto bad;
           72                 l = ((c << Bitx) | c1) & Rune2;
           73                 if(l <= Rune1)
           74                         goto bad;
           75                 *rune = l;
           76                 return 2;
           77         }
           78 
           79         /*
           80          * three character sequence
           81          *        0800-FFFF => T3 Tx Tx
           82          */
           83         c2 = *(uchar*)(str+2) ^ Tx;
           84         if(c2 & Testx)
           85                 goto bad;
           86         if(c < T4) {
           87                 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
           88                 if(l <= Rune2)
           89                         goto bad;
           90                 *rune = l;
           91                 return 3;
           92         }
           93 
           94         /*
           95          * four character sequence
           96          *        10000-10FFFF => T4 Tx Tx Tx
           97          */
           98         if(UTFmax >= 4) {
           99                 c3 = *(uchar*)(str+3) ^ Tx;
          100                 if(c3 & Testx)
          101                         goto bad;
          102                 if(c < T5) {
          103                         l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
          104                         if(l <= Rune3)
          105                                 goto bad;
          106                         if(l > Runemax)
          107                                 goto bad;
          108                         *rune = l;
          109                         return 4;
          110                 }
          111         }
          112 
          113         /*
          114          * bad decoding
          115          */
          116 bad:
          117         *rune = Bad;
          118         return 1;
          119 }
          120 
          121 int
          122 runetochar(char *str, Rune *rune)
          123 {
          124         long c;
          125 
          126         /*
          127          * one character sequence
          128          *        00000-0007F => 00-7F
          129          */
          130         c = *rune;
          131         if(c <= Rune1) {
          132                 str[0] = c;
          133                 return 1;
          134         }
          135 
          136         /*
          137          * two character sequence
          138          *        00080-007FF => T2 Tx
          139          */
          140         if(c <= Rune2) {
          141                 str[0] = T2 | (c >> 1*Bitx);
          142                 str[1] = Tx | (c & Maskx);
          143                 return 2;
          144         }
          145 
          146         /*
          147          * three character sequence
          148          *        00800-0FFFF => T3 Tx Tx
          149          */
          150         if(c > Runemax)
          151                 c = Runeerror;
          152         if(c <= Rune3) {
          153                 str[0] = T3 |  (c >> 2*Bitx);
          154                 str[1] = Tx | ((c >> 1*Bitx) & Maskx);
          155                 str[2] = Tx |  (c & Maskx);
          156                 return 3;
          157         }
          158         
          159         /*
          160          * four character sequence
          161          *        010000-1FFFFF => T4 Tx Tx Tx
          162          */
          163         str[0] = T4 |  (c >> 3*Bitx);
          164         str[1] = Tx | ((c >> 2*Bitx) & Maskx);
          165         str[2] = Tx | ((c >> 1*Bitx) & Maskx);
          166         str[3] = Tx |  (c & Maskx);
          167         return 4;
          168 }
          169 
          170 int
          171 runelen(long c)
          172 {
          173         Rune rune;
          174         char str[10];
          175 
          176         rune = c;
          177         return runetochar(str, &rune);
          178 }
          179 
          180 int
          181 runenlen(Rune *r, int nrune)
          182 {
          183         int nb, c;
          184 
          185         nb = 0;
          186         while(nrune--) {
          187                 c = *r++;
          188                 if(c <= Rune1)
          189                         nb++;
          190                 else
          191                 if(c <= Rune2)
          192                         nb += 2;
          193                 else
          194                 if(c <= Rune3 || c > Runemax)
          195                         nb += 3;
          196                 else
          197                         nb += 4;
          198         }
          199         return nb;
          200 }
          201 
          202 int
          203 fullrune(char *str, int n)
          204 {
          205         int c;
          206 
          207         if(n <= 0)
          208                 return 0;
          209         c = *(uchar*)str;
          210         if(c < Tx)
          211                 return 1;
          212         if(c < T3)
          213                 return n >= 2;
          214         if(UTFmax == 3 || c < T4)
          215                 return n >= 3;
          216         return n >= 4;
          217 }