bitreich.org

       enc-unicode.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
 (HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
       enc-unicode.lisp (42416B)
       ---
            1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
            2 ;;;
            3 ;;; enc-unicode.lisp --- Unicode encodings.
            4 ;;;
            5 ;;; Copyright (C) 2007, Luis Oliveira  <loliveira@common-lisp.net>
            6 ;;;
            7 ;;; Permission is hereby granted, free of charge, to any person
            8 ;;; obtaining a copy of this software and associated documentation
            9 ;;; files (the "Software"), to deal in the Software without
           10 ;;; restriction, including without limitation the rights to use, copy,
           11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
           12 ;;; of the Software, and to permit persons to whom the Software is
           13 ;;; furnished to do so, subject to the following conditions:
           14 ;;;
           15 ;;; The above copyright notice and this permission notice shall be
           16 ;;; included in all copies or substantial portions of the Software.
           17 ;;;
           18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
           19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
           20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
           21 ;;; NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
           22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
           23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
           24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
           25 ;;; DEALINGS IN THE SOFTWARE.
           26 
           27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp
           28 ;;;   Copyright (C) 2006 Clozure Associates and contributors.
           29 
           30 (in-package #:babel-encodings)
           31 
           32 (eval-when (:compile-toplevel :load-toplevel :execute)
           33   (defconstant +repl+ #xfffd "Unicode replacement character code point.")
           34   (defconstant +byte-order-mark-code+ #xfeff)
           35   (defconstant +swapped-byte-order-mark-code+ #xfffe)
           36   (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))
           37 
           38 ;;; Some convenience macros adding FIXNUM declarations.
           39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))
           40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))
           41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))
           42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))
           43 
           44 ;;;; UTF-8
           45 
           46 (define-character-encoding :utf-8
           47     "An 8-bit, variable-length character encoding in which
           48 character code points in the range #x00-#x7f can be encoded in a
           49 single octet; characters with larger code values can be encoded
           50 in 2 to 4 bytes."
           51   :max-units-per-char 4
           52   :literal-char-code-limit #x80
           53   :bom-encoding #(#xef #xbb #xbf)
           54   :default-replacement #xfffd)
           55 
           56 (define-condition invalid-utf8-starter-byte (character-decoding-error)
           57   ()
           58   (:documentation "Signalled when an invalid UTF-8 starter byte is found."))
           59 
           60 (define-condition invalid-utf8-continuation-byte (character-decoding-error)
           61   ()
           62   (:documentation
           63    "Signalled when an invalid UTF-8 continuation byte is found."))
           64 
           65 (define-condition overlong-utf8-sequence (character-decoding-error)
           66   ()
           67   (:documentation "Signalled upon overlong UTF-8 sequences."))
           68 
           69 (define-octet-counter :utf-8 (getter type)
           70   `(named-lambda utf-8-octet-counter (seq start end max)
           71      (declare (type ,type seq) (fixnum start end max))
           72      (loop with noctets fixnum = 0
           73            for i fixnum from start below end
           74            for code of-type code-point = (,getter seq i) do
           75            (let ((new (+ (cond ((< code #x80) 1)
           76                                ((< code #x800) 2)
           77                                ((< code #x10000) 3)
           78                                (t 4))
           79                          noctets)))
           80              (if (and (plusp max) (> new max))
           81                  (loop-finish)
           82                  (setq noctets new)))
           83            finally (return (values noctets i)))))
           84 
           85 (define-code-point-counter :utf-8 (getter type)
           86   `(named-lambda utf-8-code-point-counter (seq start end max)
           87      (declare (type ,type seq) (fixnum start end max))
           88      (loop with nchars fixnum = 0
           89            with i fixnum = start
           90            while (< i end) do
           91            ;; check for invalid continuation bytes
           92            (macrolet ((invalid-cb-p (n)
           93                         `(and (< (+ i ,n) end)
           94                               (not (< #x7f (,',getter seq (+ i ,n)) #xc0)))))
           95              ;; wrote this code with LET instead of FOR because CLISP's
           96              ;; LOOP doesn't like WHILE clauses before FOR clauses.
           97              (let* ((octet (,getter seq i))
           98                     (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p 1)) 1)
           99                                        ((or (< octet #xe0) (invalid-cb-p 2)) 2)
          100                                        ((or (< octet #xf0) (invalid-cb-p 3)) 3)
          101                                        ((or (< octet #xf8) (invalid-cb-p 4)) 4)
          102                                        ((or (< octet #xfc) (invalid-cb-p 5)) 5)
          103                                        (t 6)))))
          104                (declare (type ub8 octet) (fixnum next-i))
          105                (cond
          106                  ((> next-i end)
          107                   ;; Should we add restarts to this error, we'll have
          108                   ;; to figure out a way to communicate with the
          109                   ;; decoder since we probably want to do something
          110                   ;; about it right here when we have a chance to
          111                   ;; change the count or something.  (Like an
          112                   ;; alternative replacement character or perhaps the
          113                   ;; existence of this error so that the decoder
          114                   ;; doesn't have to check for it on every iteration
          115                   ;; like we do.)
          116                   ;;
          117                   ;; FIXME: The data for this error is not right.
          118                   (decoding-error (vector octet) :utf-8 seq i
          119                                   nil 'end-of-input-in-character)
          120                   (return (values (1+ nchars) end)))
          121                  (t
          122                   (setq nchars (1+ nchars)
          123                         i next-i)
          124                   (when (and (plusp max) (= nchars max))
          125                     (return (values nchars i)))))))
          126            finally (progn
          127                      (assert (= i end))
          128                      (return (values nchars i))))))
          129 
          130 (define-encoder :utf-8 (getter src-type setter dest-type)
          131   `(named-lambda utf-8-encoder (src start end dest d-start)
          132      (declare (type ,src-type src)
          133               (type ,dest-type dest)
          134               (fixnum start end d-start))
          135      (loop with di fixnum = d-start
          136            for i fixnum from start below end
          137            for code of-type code-point = (,getter src i) do
          138            (macrolet ((set-octet (offset value)
          139                         `(,',setter ,value dest (the fixnum (+ di ,offset)))))
          140              (cond
          141                ;; 1 octet
          142                ((< code #x80)
          143                 (set-octet 0 code)
          144                 (incf di))
          145                ;; 2 octets
          146                ((< code #x800)
          147                 (set-octet 0 (logior #xc0 (f-ash code -6)))
          148                 (set-octet 1 (logior #x80 (f-logand code #x3f)))
          149                 (incf di 2))
          150                ;; 3 octets
          151                ((< code #x10000)
          152                 (set-octet 0 (logior #xe0 (f-ash code -12)))
          153                 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
          154                 (set-octet 2 (logior #x80 (f-logand code #x3f)))
          155                 (incf di 3))
          156                ;; 4 octets
          157                (t
          158                 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
          159                 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
          160                 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
          161                 (set-octet 3 (logior #x80 (logand code #x3f)))
          162                 (incf di 4))))
          163            finally (return (the fixnum (- di d-start))))))
          164 
          165 (define-decoder :utf-8 (getter src-type setter dest-type)
          166   `(named-lambda utf-8-decoder (src start end dest d-start)
          167      (declare (type ,src-type src)
          168               (type ,dest-type dest)
          169               (fixnum start end d-start))
          170      (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))
          171        (declare (type ub8 u2 u3 u4 u5 u6))
          172        (loop for di fixnum from d-start
          173              for i fixnum from start below end
          174              for u1 of-type ub8 = (,getter src i) do
          175              ;; Note: CONSUME-OCTET doesn't check if I is being
          176              ;; incremented past END.  We're assuming that END has
          177              ;; been calculated with the CODE-POINT-POINTER above that
          178              ;; checks this.
          179              (macrolet
          180                  ((consume-octet ()
          181                     `(let ((next-i (incf i)))
          182                        (if (= next-i end)
          183                            ;; FIXME: data for this error is incomplete.
          184                            ;; and signalling this error twice
          185                            (return-from setter-block
          186                              (decoding-error nil :utf-8 src i +repl+
          187                                              'end-of-input-in-character))
          188                            (,',getter src next-i))))
          189                   (handle-error (n &optional (c 'character-decoding-error))
          190                     `(decoding-error
          191                       (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))
          192                       :utf-8 src (1+ (- i ,n)) +repl+ ',c))
          193                   (handle-error-if-icb (var n)
          194                     `(when (not (< #x7f ,var #xc0))
          195                        (decf i)
          196                        (return-from setter-block
          197                          (handle-error ,n invalid-utf8-continuation-byte)))))
          198                (,setter
          199                 (block setter-block
          200                   (cond
          201                     ((< u1 #x80) u1)    ; 1 octet
          202                     ((< u1 #xc0)
          203                      (handle-error 1 invalid-utf8-starter-byte))
          204                     (t
          205                      (setq u2 (consume-octet))
          206                      (handle-error-if-icb u2 1)
          207                      (cond
          208                        ((< u1 #xc2)
          209                         (handle-error 2 overlong-utf8-sequence))
          210                        ((< u1 #xe0)     ; 2 octets
          211                         (logior (f-ash (f-logand #x1f u1) 6)
          212                                 (f-logxor u2 #x80)))
          213                        (t
          214                         (setq u3 (consume-octet))
          215                         (handle-error-if-icb u3 2)
          216                         (cond
          217                           ((and (= u1 #xe0) (< u2 #xa0))
          218                            (handle-error 3 overlong-utf8-sequence))
          219                           ((< u1 #xf0)  ; 3 octets
          220                            (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
          221                                                   (f-ash (f-logand u2 #x3f) 6))))
          222                              (if (<= #xd800 start #xdfc0)
          223                                  (handle-error 3 character-out-of-range)
          224                                  (logior start (f-logand u3 #x3f)))))
          225                           (t            ; 4 octets
          226                            (setq u4 (consume-octet))
          227                            (handle-error-if-icb u4 3)
          228                            (cond
          229                              ((and (= u1 #xf0) (< u2 #x90))
          230                               (handle-error 4 overlong-utf8-sequence))
          231                              ((< u1 #xf8)
          232                               (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2 #x8f)))
          233                                   (handle-error 4 character-out-of-range)
          234                                   (f-logior (f-ash (f-logand u1 7) 18)
          235                                             (f-ash (f-logxor u2 #x80) 12)
          236                                             (f-ash (f-logxor u3 #x80) 6)
          237                                             (f-logxor u4 #x80))))
          238                              ;; from here on we'll be getting either
          239                              ;; invalid continuation bytes or overlong
          240                              ;; 5-byte or 6-byte sequences.
          241                              (t
          242                               (setq u5 (consume-octet))
          243                               (handle-error-if-icb u5 4)
          244                               (cond
          245                                 ((and (= u1 #xf8) (< u2 #x88))
          246                                  (handle-error 5 overlong-utf8-sequence))
          247                                 ((< u1 #xfc)
          248                                  (handle-error 5 character-out-of-range))
          249                                 (t
          250                                  (setq u6 (consume-octet))
          251                                  (handle-error-if-icb u6 5)
          252                                  (cond
          253                                    ((and (= u1 #xfc) (< u2 #x84))
          254                                     (handle-error 6 overlong-utf8-sequence))
          255                                    (t
          256                                     (handle-error 6 character-out-of-range)
          257                                     )))))))))))))
          258                 dest di))
          259              finally (return (the fixnum (- di d-start)))))))
          260 
          261 ;;;; UTF-8B
          262 
          263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is
          264 ;;; the closest thing to a UTF-8B specification:
          265 ;;;
          266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>
          267 ;;;
          268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed
          269 ;;;     UTF-8 sequence
          270 ;;;
          271 ;;;  All the previous options for converting malformed UTF-8 sequences
          272 ;;;  to UTF-16 destroy information. This can be highly undesirable in
          273 ;;;  applications such as text file editors, where guaranteed binary
          274 ;;;  transparency is a desireable feature. (E.g., I frequently edit
          275 ;;;  executable code or graphic files with the Emacs text editor and I
          276 ;;;  hate the idea that my editor might automatically make U+FFFD
          277 ;;;  substitutions at locations that I haven't even edited when I save
          278 ;;;  the file again.)
          279 ;;;
          280 ;;;  I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx
          281 ;;;  mailing list the following approach. Instead of using U+FFFD,
          282 ;;;  simply encode malformed UTF-8 sequences as malformed UTF-16
          283 ;;;  sequences. Malformed UTF-8 sequences consist excludively of the
          284 ;;;  bytes 0x80 - 0xff, and each of these bytes can be represented
          285 ;;;  using a 16-bit value from the UTF-16 low-half surrogate zone
          286 ;;;  U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from
          287 ;;;  the above example would be represented in UTF-16 as U+DCC1
          288 ;;;  U+DC8B. If we simply make sure that every UTF-8 encoded surrogate
          289 ;;;  character is also treated like a malformed sequence, then there
          290 ;;;  is no way that a single high-half surrogate could precede the
          291 ;;;  encoded malformed sequence and cause a valid UTF-16 sequence to
          292 ;;;  emerge.
          293 ;;;
          294 ;;;  This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8
          295 ;;;  round-trip compatibility can be achieved quite easily.
          296 ;;;
          297 ;;;  On an output device, a lonely low-half surrogate character should
          298 ;;;  be treated just like a character outside the adopted subset of
          299 ;;;  representable characters, that is for the end user, the display
          300 ;;;  would look exactly like with semantics B), i.e. one symbol per
          301 ;;;  byte of a malformed sequence. However in contrast to semantics
          302 ;;;  B), no information is thrown away, and a cut&paste in an editor
          303 ;;;  or terminal emulator will be guaranteed to reconstruct the
          304 ;;;  original byte sequence. This should greatly reduce the incidence
          305 ;;;  of accidental corruption of binary data by UTF-8 -> UTF-16 ->
          306 ;;;  UTF-8 conversion round trips."
          307 
          308 (define-character-encoding :utf-8b
          309     "An 8-bit, variable-length character encoding in which
          310 character code points in the range #x00-#x7f can be encoded in a
          311 single octet; characters with larger code values can be encoded
          312 in 2 to 4 bytes.  Invalid UTF-8 sequences are encoded with #xDCXX
          313 code points for each invalid byte."
          314   :max-units-per-char 4
          315   :literal-char-code-limit #x80
          316   :bom-encoding #(#xef #xbb #xbf)
          317   :default-replacement nil)
          318 
          319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro.
          320 (define-octet-counter :utf-8b (getter type)
          321   `(named-lambda utf-8b-octet-counter (seq start end max)
          322      (declare (type ,type seq) (fixnum start end max))
          323      (loop with noctets fixnum = 0
          324            for i fixnum from start below end
          325            for code of-type code-point = (,getter seq i) do
          326            (let ((new (+ (cond ((< code #x80) 1)
          327                                ((< code #x800) 2)
          328                                ((<= #xdc80 code #xdcff) 1)
          329                                ((< code #x10000) 3)
          330                                (t 4))
          331                          noctets)))
          332              (if (and (plusp max) (> new max))
          333                  (loop-finish)
          334                  (setq noctets new)))
          335            finally (return (values noctets i)))))
          336 
          337 (define-code-point-counter :utf-8b (getter type)
          338   `(named-lambda utf-8b-code-point-counter (seq start end max)
          339      (declare (type ,type seq) (fixnum start end max))
          340      (loop with nchars fixnum = 0
          341            with i fixnum = start
          342            while (< i end) do
          343            ;; wrote this code with LET instead of FOR because CLISP's
          344            ;; LOOP doesn't like WHILE clauses before FOR clauses.
          345            (let* ((octet (,getter seq i))
          346                   (noctets (cond ((< octet #x80) 1)
          347                                  ((< octet #xe0) 2)
          348                                  ((< octet #xf0) 3)
          349                                  (t 4))))
          350              (declare (type ub8 octet) (fixnum noctets))
          351              (cond
          352                ((> (+ i noctets) end)
          353                 ;; If this error is suppressed these last few bytes
          354                 ;; will be encoded as raw bytes later.
          355                 (decoding-error (vector octet) :utf-8 seq i
          356                                 nil 'end-of-input-in-character)
          357                 (return (values (+ nchars (- end i)) end)))
          358                (t
          359                 ;; FIXME: clean this mess up.
          360                 (let* ((u1 octet)
          361                        (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))
          362                        (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))
          363                        (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))
          364                        (inc (or (and (> noctets 1)
          365                                      (< u1 #xc2))
          366                                 (and (= noctets 2)
          367                                      (not (logior u2 #x40)))
          368                                 (and (= noctets 3)
          369                                      (not (and (< (f-logxor u2 #x80) #x40)
          370                                                (< (f-logxor u3 #x80) #x40)
          371                                                (or (>= u1 #xe1) (>= u2 #xa0))
          372                                                (or (/= u1 #xed) (< u2 #xa0) (> u2 #xbf)))))
          373                                 (and (= noctets 4)
          374                                      (not
          375                                       (and (< (f-logxor u2 #x80) #x40)
          376                                            (< (f-logxor u3 #x80) #x40)
          377                                            (< (f-logxor u4 #x80) #x40)
          378                                            (or (>= u1 #xf1) (>= u2 #x90))))))))
          379                   (let ((new-nchars (if inc (+ nchars noctets) (1+ nchars))))
          380                     (when (and (plusp max) (> new-nchars max))
          381                       (return (values nchars i)))
          382                     (incf i noctets)
          383                     (setq nchars new-nchars))))))
          384            finally (progn
          385                      (assert (= i end))
          386                      (return (values nchars i))))))
          387 
          388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro.
          389 (define-encoder :utf-8b (getter src-type setter dest-type)
          390   `(named-lambda utf-8b-encoder (src start end dest d-start)
          391      (declare (type ,src-type src)
          392               (type ,dest-type dest)
          393               (fixnum start end d-start))
          394      (loop with di fixnum = d-start
          395            for i fixnum from start below end
          396            for code of-type code-point = (,getter src i) do
          397            (macrolet ((set-octet (offset value)
          398                         `(,',setter ,value dest (the fixnum (+ di ,offset)))))
          399              (cond
          400                ;; 1 octet
          401                ((< code #x80)
          402                 (set-octet 0 code)
          403                 (incf di))
          404                ;; 2 octets
          405                ((< code #x800)
          406                 (set-octet 0 (logior #xc0 (f-ash code -6)))
          407                 (set-octet 1 (logior #x80 (f-logand code #x3f)))
          408                 (incf di 2))
          409                ;; 1 octet (invalid octet)
          410                ((<= #xdc80 code #xdcff)
          411                 (set-octet 0 (f-logand code #xff))
          412                 (incf di))
          413                ;; 3 octets
          414                ((< code #x10000)
          415                 (set-octet 0 (logior #xe0 (f-ash code -12)))
          416                 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
          417                 (set-octet 2 (logior #x80 (f-logand code #x3f)))
          418                 (incf di 3))
          419                ;; 4 octets
          420                (t
          421                 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
          422                 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
          423                 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
          424                 (set-octet 3 (logand #x3f code))
          425                 (incf di 4))))
          426            finally (return (the fixnum (- di d-start))))))
          427 
          428 (define-decoder :utf-8b (getter src-type setter dest-type)
          429   `(named-lambda utf-8b-decoder (src start end dest d-start)
          430      (declare (type ,src-type src)
          431               (type ,dest-type dest)
          432               (fixnum start end d-start))
          433      (let ((u2 0) (u3 0) (u4 0))
          434        (declare (type ub8 u2 u3 u4))
          435        (loop for di fixnum from d-start
          436              for i fixnum from start below end
          437              for u1 of-type ub8 = (,getter src i) do
          438              ;; Unlike the UTF-8 version, this version of
          439              ;; CONSUME-OCTET needs to check if I is being incremented
          440              ;; past END because we might have trailing binary
          441              ;; garbage.
          442              (macrolet
          443                  ((consume-octet (n)
          444                     `(if (= i (1- end))
          445                          (encode-raw-octets ,n)
          446                          (,',getter src (incf i))))
          447                   (encode-raw-octets (n)
          448                     `(progn
          449                        ,@(loop for i below n and var in '(u1 u2 u3 u4)
          450                                collect `(,',setter (logior #xdc00 ,var) dest di)
          451                                unless (= i (1- n))
          452                                collect '(incf di))
          453                        (return-from set-body))))
          454                (block set-body
          455                  (,setter (cond
          456                             ((< u1 #x80) ; 1 octet
          457                              u1)
          458                             ((>= u1 #xc2)
          459                              (setq u2 (consume-octet 1))
          460                              (cond
          461                                ((< u1 #xe0) ; 2 octets
          462                                 (if (< (f-logxor u2 #x80) #x40)
          463                                     (logior (f-ash (f-logand #x1f u1) 6)
          464                                             (f-logxor u2 #x80))
          465                                     (encode-raw-octets 2)))
          466                                (t
          467                                 (setq u3 (consume-octet 2))
          468                                 (cond
          469                                   ((< u1 #xf0) ; 3 octets
          470                                    (if (and (< (f-logxor u2 #x80) #x40)
          471                                             (< (f-logxor u3 #x80) #x40)
          472                                             (or (>= u1 #xe1) (>= u2 #xa0)))
          473                                        (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
          474                                                               (f-ash (f-logand u2 #x3f) 6))))
          475                                          (if (<= #xd800 start #xdfc0)
          476                                              (encode-raw-octets 3)
          477                                              (logior start (f-logand u3 #x3f))))
          478                                        (encode-raw-octets 3)))
          479                                   (t    ; 4 octets
          480                                    (setq u4 (consume-octet 3))
          481                                    (if (and (< (f-logxor u2 #x80) #x40)
          482                                             (< (f-logxor u3 #x80) #x40)
          483                                             (< (f-logxor u4 #x80) #x40)
          484                                             (or (>= u1 #xf1) (>= u2 #x90)))
          485                                        (logior
          486                                         (f-logior (f-ash (f-logand u1 7) 18)
          487                                                   (f-ash (f-logxor u2 #x80) 12))
          488                                         (f-logior (f-ash (f-logxor u3 #x80) 6)
          489                                                   (f-logxor u4 #x80)))
          490                                        (encode-raw-octets 4)))))))
          491                             (t (encode-raw-octets 1)))
          492                           dest di)))
          493              finally (return (the fixnum (- di d-start)))))))
          494 
          495 ;;;; UTF-16
          496 
          497 ;;; TODO: add a way to pass some info at compile-time telling us that,
          498 ;;; for example, the maximum code-point will always be < #x10000 in
          499 ;;; which case we could simply return (* 2 (- end start)).
          500 (defmacro utf16-octet-counter (getter type)
          501   `(named-lambda utf-16-octet-counter (seq start end max)
          502      (declare (type ,type seq) (fixnum start end max))
          503      (loop with noctets fixnum = 0
          504            for i fixnum from start below end
          505            for code of-type code-point = (,getter seq i)
          506            do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctets))))
          507                 (if (and (plusp max) (> new max))
          508                     (loop-finish)
          509                     (setq noctets new)))
          510            finally (return (values noctets i)))))
          511 
          512 (defmacro utf-16-combine-surrogate-pairs (u1 u2)
          513   `(the (unsigned-byte 21)
          514      (+ #x10000
          515         (the (unsigned-byte 20)
          516           (logior
          517            (the (unsigned-byte 20)
          518              (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))
          519            (the (unsigned-byte 10)
          520              (- ,u2 #xdc00)))))))
          521 
          522 (defmacro define-utf-16 (name &optional endianness)
          523   (check-type endianness (or null (eql :be) (eql :le)))
          524   (check-type name keyword)
          525   (let ((swap-var (gensym "SWAP"))
          526         (code-point-counter-name
          527           (format-symbol t '#:~a-code-point-counter (string name)))
          528         (encoder-name (format-symbol t '#:~a-encoder (string name)))
          529         (decoder-name (format-symbol t '#:~a-decoder (string name))))
          530     (labels ((make-bom-check-form (end start getter seq)
          531                (if (null endianness)
          532                  ``((,',swap-var
          533                       (when (> ,,end ,,start)
          534                         (case (,,getter ,,seq ,,start 2 :ne)
          535                           (#.+byte-order-mark-code+ (incf ,,start 2) nil)
          536                           (#.+swapped-byte-order-mark-code+ (incf ,,start 2) t)
          537                           (t #+little-endian t)))))
          538                  '()))
          539              (make-getter-form (getter src i)
          540                (case endianness
          541                  (:le ``(,,getter ,,src ,,i 2 :le))
          542                  (:be ``(,,getter ,,src ,,i 2 :be))
          543                  (t ``(if ,',swap-var
          544                         (,,getter ,,src ,,i 2 :re)
          545                         (,,getter ,,src ,,i 2 :ne)))))
          546              (make-setter-form (setter code dest di)
          547                (case endianness
          548                  (:be ``(,,setter ,,code ,,dest ,,di 2 :be))
          549                  (:le ``(,,setter ,,code ,,dest ,,di 2 :le))
          550                  (t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))
          551       `(progn
          552          (define-octet-counter ,name (getter type)
          553            `(utf16-octet-counter ,getter ,type))
          554          (define-code-point-counter ,name (getter type)
          555            `(named-lambda ,',code-point-counter-name (seq start end max)
          556               (declare (type ,type seq) (fixnum start end max))
          557               (let* ,,(make-bom-check-form ''end ''start 'getter ''seq)
          558                 (loop with count fixnum = 0
          559                   with i fixnum = start
          560                   while (<= i (- end 2)) do
          561                   (let* ((code ,,(make-getter-form 'getter ''seq ''i))
          562                          (next-i (+ i (if (or (< code #xd800) (>= code #xdc00))
          563                                         2
          564                                         4))))
          565                     (declare (type (unsigned-byte 16) code) (fixnum next-i))
          566                     (cond
          567                       ((> next-i end)
          568                        (decoding-error
          569                          (vector (,getter seq i) (,getter seq (1+ i)))
          570                          ,',name seq i nil 'end-of-input-in-character)
          571                        (return (values count i)))
          572                       (t
          573                         (setq i next-i
          574                               count (1+ count))
          575                         (when (and (plusp max) (= count max))
          576                           (return (values count i))))))
          577                   finally (progn
          578                             (assert (= i end))
          579                             (return (values count i)))))))
          580          (define-encoder ,name (getter src-type setter dest-type)
          581            `(named-lambda ,',encoder-name (src start end dest d-start)
          582               (declare (type ,src-type src)
          583                        (type ,dest-type dest)
          584                        (fixnum start end d-start))
          585               (loop with di fixnum = d-start
          586                 for i fixnum from start below end
          587                 for code of-type code-point = (,getter src i)
          588                 for high-bits fixnum = (- code #x10000) do
          589                 (cond ((< high-bits 0)
          590                        ,,(make-setter-form 'setter ''code ''dest ''di)
          591                        (incf di 2))
          592                       (t
          593                         ,,(make-setter-form
          594                               'setter ''(logior #xd800 (f-ash high-bits -10))
          595                             ''dest ''di)
          596                         ,,(make-setter-form
          597                               'setter ''(logior #xdc00 (f-logand high-bits #x3ff))
          598                             ''dest ''(+ di 2))
          599                         (incf di 4)))
          600                 finally (return (the fixnum (- di d-start))))))
          601          (define-decoder ,name (getter src-type setter dest-type)
          602            `(named-lambda ,',decoder-name (src start end dest d-start)
          603               (declare (type ,src-type src)
          604                        (type ,dest-type dest)
          605                        (fixnum start end d-start))
          606               (let ,,(make-bom-check-form ''end ''start 'getter ''src)
          607                 (loop with i fixnum = start
          608                   for di fixnum from d-start
          609                   until (= i end) do
          610                   (let ((u1 ,,(make-getter-form 'getter ''src ''i)))
          611                     (declare (type (unsigned-byte 16) u1))
          612                     (incf i 2)
          613                     (,setter (cond
          614                                ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 octets
          615                                 u1)
          616                                ((< u1 #xdc00) ; 4 octets
          617                                 (let ((u2 ,,(make-getter-form 'getter ''src ''i)))
          618                                   (declare (type (unsigned-byte 16) u2))
          619                                   (incf i 2)
          620                                   (if (and (>= u2 #xdc00) (< u2 #xe000))
          621                                     (utf-16-combine-surrogate-pairs u1 u2)
          622                                     (decoding-error
          623                                       (vector (,getter src (- i 4))
          624                                               (,getter src (- i 3))
          625                                               (,getter src (- i 2))
          626                                               (,getter src (- i 1)))
          627                                       ,',name src i +repl+))))
          628                                (t
          629                                  (decoding-error (vector (,getter src (- i 2))
          630                                                          (,getter src (- i 1)))
          631                                                  ,',name src i +repl+)))
          632                              dest di))
          633                   finally (return (the fixnum (- di d-start)))))))
          634          ',name))))
          635 
          636 (define-character-encoding :utf-16
          637     "A 16-bit, variable-length encoding in which characters with
          638 code points less than #x10000 can be encoded in a single 16-bit
          639 word and characters with larger codes can be encoded in a pair of
          640 16-bit words.  The endianness of the encoded data is indicated by
          641 the endianness of a byte-order-mark character (#\u+feff)
          642 prepended to the data; in the absence of such a character on
          643 input, the data is assumed to be in big-endian order.  Output is
          644 written in native byte-order with a leading byte-order mark."
          645   :max-units-per-char 2
          646   :code-unit-size 16
          647   :native-endianness t            ; not necessarily true when decoding
          648   :decode-literal-code-unit-limit #xd800
          649   :encode-literal-code-unit-limit #x10000
          650   :use-bom #+big-endian :utf-16be #+little-endian :utf-16le
          651   :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)
          652   :nul-encoding #(0 0)
          653   :default-replacement #xfffd
          654   :ambiguous #+little-endian t #+big-endian nil)
          655 
          656 (define-utf-16 :utf-16)
          657 
          658 (define-character-encoding :utf-16le
          659     "A 16-bit, variable-length encoding in which characters with
          660 code points less than #x10000 can be encoded in a single 16-bit
          661 word and characters with larger codes can be encoded in a pair of
          662 16-bit words.  The data is assumed to be in little-endian order.  Output is
          663 written in little-endian byte-order without a leading byte-order mark."
          664   :aliases '(:utf-16/le)
          665   :max-units-per-char 2
          666   :code-unit-size 16
          667   :native-endianness #+little-endian t #+big-endian nil
          668   :decode-literal-code-unit-limit #xd800
          669   :encode-literal-code-unit-limit #x10000
          670   :nul-encoding #(0 0)
          671   :default-replacement #xfffd)
          672 
          673 (define-utf-16 :utf-16le :le)
          674 
          675 (define-character-encoding :utf-16be
          676     "A 16-bit, variable-length encoding in which characters with
          677 code points less than #x10000 can be encoded in a single 16-bit
          678 word and characters with larger codes can be encoded in a pair of
          679 16-bit words.  The data is assumed to be in big-endian order.  Output is
          680 written in big-endian byte-order without a leading byte-order mark."
          681   :aliases '(:utf-16/be)
          682   :max-units-per-char 2
          683   :code-unit-size 16
          684   :native-endianness #+little-endian nil #+big-endian t
          685   :decode-literal-code-unit-limit #xd800
          686   :encode-literal-code-unit-limit #x10000
          687   :nul-encoding #(0 0)
          688   :default-replacement #xfffd)
          689 
          690 (define-utf-16 :utf-16be :be)
          691 
          692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000))
          693   (check-type name keyword)
          694   (check-type bytes (or (eql 2) (eql 4)))
          695   (check-type endianness (or null (eql :le) (eql :be)))
          696   (let ((swap-var (gensym "SWAP"))
          697         (code-point-counter-name
          698           (format-symbol t '#:~a-code-point-counter (string name)))
          699         (encoder-name
          700           (format-symbol t '#:~a-encoder (string name)))
          701         (decoder-name
          702           (format-symbol t '#:~a-decoder (string name))))
          703     (labels ((make-bom-check-form (end start getter src)
          704                (if (null endianness)
          705                    ``(when (not (zerop (- ,,end ,,start)))
          706                        (case (,,getter ,,src 0 ,',bytes :ne)
          707                          (#.+byte-order-mark-code+
          708                           (incf ,,start ,',bytes) nil)
          709                          (#.+swapped-byte-order-mark-code-32+
          710                           (incf ,,start ,',bytes) t)
          711                          (t #+little-endian t)))
          712                    '()))
          713              (make-setter-form (setter code dest di)
          714                ``(,,setter ,,code ,,dest ,,di ,',bytes
          715                            ,',(or endianness :ne)))
          716              (make-getter-form (getter src i)
          717                (if (null endianness)
          718                    ``(if ,',swap-var
          719                          (,,getter ,,src ,,i ,',bytes :re)
          720                          (,,getter ,,src ,,i ,',bytes :ne))
          721                    ``(,,getter ,,src ,,i ,',bytes ,',endianness))))
          722       `(progn
          723          (define-code-point-counter ,name (getter type)
          724            `(named-lambda ,',code-point-counter-name (seq start end max)
          725               (declare (type ,type seq) (fixnum start end max))
          726               ;; check for bom
          727               ,,(make-bom-check-form ''end ''start 'getter ''seq)
          728               (multiple-value-bind (count rem)
          729                   (floor (- end start) ,',bytes)
          730                 (cond
          731                   ((and (plusp max) (> count max))
          732                    (values max (the fixnum (+ start (* ,',bytes max)))))
          733                   (t
          734                    ;; check for incomplete last character
          735                    (unless (zerop rem)
          736                      (let ((vector (make-array ,',bytes :fill-pointer 0)))
          737                        (dotimes (i rem)
          738                          (vector-push (,getter seq (+ i (- end rem))) vector))
          739                        (decoding-error vector ,',name seq (the fixnum (- end rem)) nil
          740                                        'end-of-input-in-character)
          741                        (decf end rem)))
          742                    (values count end))))))
          743          (define-encoder ,name (getter src-type setter dest-type)
          744            `(named-lambda ,',encoder-name (src start end dest d-start)
          745               (declare (type ,src-type src)
          746                        (type ,dest-type dest)
          747                        (fixnum start end d-start))
          748               (loop for i fixnum from start below end
          749                     and di fixnum from d-start by ,',bytes
          750                     for code of-type code-point = (,getter src i)
          751                     do (if (>= code ,',limit)
          752                            (encoding-error code ,',name src i +repl+)
          753                            ,,(make-setter-form 'setter ''code ''dest ''di))
          754                     finally (return (the fixnum (- di d-start))))))
          755          (define-decoder ,name (getter src-type setter dest-type)
          756            `(named-lambda ,',decoder-name (src start end dest d-start)
          757               (declare (type ,src-type src)
          758                        (type ,dest-type dest)
          759                        (fixnum start end d-start))
          760               (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'getter ''src)))
          761                 (declare (ignorable ,',swap-var))
          762                 (loop for i fixnum from start below end by ,',bytes
          763                       and di from d-start
          764                       do (,setter (let ((unit ,,(make-getter-form 'getter ''src ''i)))
          765                                     (if (>= unit ,',limit)
          766                                         (decoding-error
          767                                          (vector (,getter src i)
          768                                                  (,getter src (+ i 1))
          769                                                  ,@,(if (= bytes 4)
          770                                                         ``((,getter src (+ i 2))
          771                                                            (,getter src (+ i 3)))))
          772                                          ,',name src i +repl+
          773                                          'character-out-of-range)
          774                                         unit))
          775                                   dest di)
          776                       finally (return (the fixnum (- di d-start)))))))
          777          ',name))))
          778 
          779 ;;;; UTF-32
          780 
          781 (define-character-encoding :utf-32
          782    "A 32-bit, fixed-length encoding in which all Unicode
          783 characters can be encoded in a single 32-bit word.  The
          784 endianness of the encoded data is indicated by the endianness of
          785 a byte-order-mark character (#\u+feff) prepended to the data; in
          786 the absence of such a character on input, input data is assumed
          787 to be in big-endian order.  Output is written in native byte
          788 order with a leading byte-order mark."
          789   :aliases '(:ucs-4)
          790   :max-units-per-char 1
          791   :code-unit-size 32
          792   :native-endianness t ; not necessarily true when decoding
          793   :literal-char-code-limit #x110000
          794   :use-bom #+little-endian :utf-32le #+big-endian :utf-32be
          795   :bom-encoding
          796   #+big-endian #(#x00 #x00 #xfe #xff)
          797   #+little-endian #(#xff #xfe #x00 #x00)
          798   :nul-encoding #(0 0 0 0)
          799   :ambiguous #+little-endian t #+big-endian nil)
          800 
          801 (define-ucs :utf-32 4)
          802 
          803 (define-character-encoding :utf-32le
          804    "A 32-bit, fixed-length encoding in which all Unicode
          805 characters can be encoded in a single 32-bit word. Input data is assumed
          806 to be in little-endian order.  Output is also written in little-endian byte
          807 order without a leading byte-order mark."
          808   :max-units-per-char 1
          809   :code-unit-size 32
          810   :aliases '(:utf-32/le :ucs-4le :ucs-4/le)
          811   :native-endianness #+little-endian t #+big-endian nil
          812   :literal-char-code-limit #x110000
          813   :nul-encoding #(0 0 0 0))
          814 
          815 (define-ucs :utf-32le 4 :le)
          816 
          817 (define-character-encoding :utf-32be
          818    "A 32-bit, fixed-length encoding in which all Unicode
          819 characters can be encoded in a single 32-bit word. Input data is assumed
          820 to be in big-endian order.  Output is also written in big-endian byte
          821 order without a leading byte-order mark."
          822   :max-units-per-char 1
          823   :code-unit-size 32
          824   :aliases '(:utf-32/be :ucs-4be :ucs-4/be)
          825   :native-endianness #+little-endian nil #+big-endian t
          826   :literal-char-code-limit #x110000
          827   :nul-encoding #(0 0 0 0))
          828 
          829 (define-ucs :utf-32be 4 :be)
          830 
          831 ;; UCS-2
          832 
          833 (define-character-encoding :ucs-2
          834    "A 16-bit, fixed-length encoding in which all Unicode
          835 characters can be encoded in a single 16-bit word.  The
          836 endianness of the encoded data is indicated by the endianness of
          837 a byte-order-mark character (#\u+feff) prepended to the data; in
          838 the absence of such a character on input, input data is assumed
          839 to be in big-endian order.  Output is written in native byte
          840 order with a leading byte-order mark."
          841   :aliases '(:ucs-2)
          842   :max-units-per-char 1
          843   :code-unit-size 16
          844   :native-endianness t ; not necessarily true when decoding
          845   :literal-char-code-limit #x10000
          846   :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be
          847   :bom-encoding
          848   #+big-endian #(#xfe #xff)
          849   #+little-endian #(#xff #xfe)
          850   :nul-encoding #(0 0)
          851   :ambiguous #+little-endian t #+big-endian nil)
          852 
          853 (define-ucs :ucs-2 2 nil #x10000)
          854 
          855 (define-character-encoding :ucs-2le
          856    "A 16-bit, fixed-length encoding in which all Unicode
          857 characters can be encoded in a single 16-bit word. Input data is assumed
          858 to be in little-endian order.  Output is also written in little-endian byte
          859 order without a leading byte-order mark."
          860   :max-units-per-char 1
          861   :code-unit-size 16
          862   :aliases '(:ucs-2/le)
          863   :native-endianness #+little-endian t #+big-endian nil
          864   :literal-char-code-limit #x10000
          865   :nul-encoding #(0 0))
          866 
          867 (define-ucs :ucs-2le 2 :le #x10000)
          868 
          869 (define-character-encoding :ucs-2be
          870    "A 16-bit, fixed-length encoding in which all Unicode
          871 characters can be encoded in a single 16-bit word. Input data is assumed
          872 to be in big-endian order.  Output is also written in big-endian byte
          873 order without a leading byte-order mark."
          874   :max-units-per-char 1
          875   :code-unit-size 16
          876   :aliases '(:ucs-2/be)
          877   :native-endianness #+little-endian nil #+big-endian t
          878   :literal-char-code-limit #x10000
          879   :nul-encoding #(0 0))
          880 
          881 (define-ucs :ucs-2be 2 :be #x10000)