enc-unicode.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
(HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) README
(DIR) LICENSE
---
enc-unicode.lisp (42416B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; enc-unicode.lisp --- Unicode encodings.
4 ;;;
5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 ;;; This implementation is largely based on OpenMCL's l1-unicode.lisp
28 ;;; Copyright (C) 2006 Clozure Associates and contributors.
29
30 (in-package #:babel-encodings)
31
32 (eval-when (:compile-toplevel :load-toplevel :execute)
33 (defconstant +repl+ #xfffd "Unicode replacement character code point.")
34 (defconstant +byte-order-mark-code+ #xfeff)
35 (defconstant +swapped-byte-order-mark-code+ #xfffe)
36 (defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))
37
38 ;;; Some convenience macros adding FIXNUM declarations.
39 (defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))
40 (defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))
41 (defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))
42 (defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))
43
44 ;;;; UTF-8
45
46 (define-character-encoding :utf-8
47 "An 8-bit, variable-length character encoding in which
48 character code points in the range #x00-#x7f can be encoded in a
49 single octet; characters with larger code values can be encoded
50 in 2 to 4 bytes."
51 :max-units-per-char 4
52 :literal-char-code-limit #x80
53 :bom-encoding #(#xef #xbb #xbf)
54 :default-replacement #xfffd)
55
56 (define-condition invalid-utf8-starter-byte (character-decoding-error)
57 ()
58 (:documentation "Signalled when an invalid UTF-8 starter byte is found."))
59
60 (define-condition invalid-utf8-continuation-byte (character-decoding-error)
61 ()
62 (:documentation
63 "Signalled when an invalid UTF-8 continuation byte is found."))
64
65 (define-condition overlong-utf8-sequence (character-decoding-error)
66 ()
67 (:documentation "Signalled upon overlong UTF-8 sequences."))
68
69 (define-octet-counter :utf-8 (getter type)
70 `(named-lambda utf-8-octet-counter (seq start end max)
71 (declare (type ,type seq) (fixnum start end max))
72 (loop with noctets fixnum = 0
73 for i fixnum from start below end
74 for code of-type code-point = (,getter seq i) do
75 (let ((new (+ (cond ((< code #x80) 1)
76 ((< code #x800) 2)
77 ((< code #x10000) 3)
78 (t 4))
79 noctets)))
80 (if (and (plusp max) (> new max))
81 (loop-finish)
82 (setq noctets new)))
83 finally (return (values noctets i)))))
84
85 (define-code-point-counter :utf-8 (getter type)
86 `(named-lambda utf-8-code-point-counter (seq start end max)
87 (declare (type ,type seq) (fixnum start end max))
88 (loop with nchars fixnum = 0
89 with i fixnum = start
90 while (< i end) do
91 ;; check for invalid continuation bytes
92 (macrolet ((invalid-cb-p (n)
93 `(and (< (+ i ,n) end)
94 (not (< #x7f (,',getter seq (+ i ,n)) #xc0)))))
95 ;; wrote this code with LET instead of FOR because CLISP's
96 ;; LOOP doesn't like WHILE clauses before FOR clauses.
97 (let* ((octet (,getter seq i))
98 (next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p 1)) 1)
99 ((or (< octet #xe0) (invalid-cb-p 2)) 2)
100 ((or (< octet #xf0) (invalid-cb-p 3)) 3)
101 ((or (< octet #xf8) (invalid-cb-p 4)) 4)
102 ((or (< octet #xfc) (invalid-cb-p 5)) 5)
103 (t 6)))))
104 (declare (type ub8 octet) (fixnum next-i))
105 (cond
106 ((> next-i end)
107 ;; Should we add restarts to this error, we'll have
108 ;; to figure out a way to communicate with the
109 ;; decoder since we probably want to do something
110 ;; about it right here when we have a chance to
111 ;; change the count or something. (Like an
112 ;; alternative replacement character or perhaps the
113 ;; existence of this error so that the decoder
114 ;; doesn't have to check for it on every iteration
115 ;; like we do.)
116 ;;
117 ;; FIXME: The data for this error is not right.
118 (decoding-error (vector octet) :utf-8 seq i
119 nil 'end-of-input-in-character)
120 (return (values (1+ nchars) end)))
121 (t
122 (setq nchars (1+ nchars)
123 i next-i)
124 (when (and (plusp max) (= nchars max))
125 (return (values nchars i)))))))
126 finally (progn
127 (assert (= i end))
128 (return (values nchars i))))))
129
130 (define-encoder :utf-8 (getter src-type setter dest-type)
131 `(named-lambda utf-8-encoder (src start end dest d-start)
132 (declare (type ,src-type src)
133 (type ,dest-type dest)
134 (fixnum start end d-start))
135 (loop with di fixnum = d-start
136 for i fixnum from start below end
137 for code of-type code-point = (,getter src i) do
138 (macrolet ((set-octet (offset value)
139 `(,',setter ,value dest (the fixnum (+ di ,offset)))))
140 (cond
141 ;; 1 octet
142 ((< code #x80)
143 (set-octet 0 code)
144 (incf di))
145 ;; 2 octets
146 ((< code #x800)
147 (set-octet 0 (logior #xc0 (f-ash code -6)))
148 (set-octet 1 (logior #x80 (f-logand code #x3f)))
149 (incf di 2))
150 ;; 3 octets
151 ((< code #x10000)
152 (set-octet 0 (logior #xe0 (f-ash code -12)))
153 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
154 (set-octet 2 (logior #x80 (f-logand code #x3f)))
155 (incf di 3))
156 ;; 4 octets
157 (t
158 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
159 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
160 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
161 (set-octet 3 (logior #x80 (logand code #x3f)))
162 (incf di 4))))
163 finally (return (the fixnum (- di d-start))))))
164
165 (define-decoder :utf-8 (getter src-type setter dest-type)
166 `(named-lambda utf-8-decoder (src start end dest d-start)
167 (declare (type ,src-type src)
168 (type ,dest-type dest)
169 (fixnum start end d-start))
170 (let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))
171 (declare (type ub8 u2 u3 u4 u5 u6))
172 (loop for di fixnum from d-start
173 for i fixnum from start below end
174 for u1 of-type ub8 = (,getter src i) do
175 ;; Note: CONSUME-OCTET doesn't check if I is being
176 ;; incremented past END. We're assuming that END has
177 ;; been calculated with the CODE-POINT-POINTER above that
178 ;; checks this.
179 (macrolet
180 ((consume-octet ()
181 `(let ((next-i (incf i)))
182 (if (= next-i end)
183 ;; FIXME: data for this error is incomplete.
184 ;; and signalling this error twice
185 (return-from setter-block
186 (decoding-error nil :utf-8 src i +repl+
187 'end-of-input-in-character))
188 (,',getter src next-i))))
189 (handle-error (n &optional (c 'character-decoding-error))
190 `(decoding-error
191 (vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))
192 :utf-8 src (1+ (- i ,n)) +repl+ ',c))
193 (handle-error-if-icb (var n)
194 `(when (not (< #x7f ,var #xc0))
195 (decf i)
196 (return-from setter-block
197 (handle-error ,n invalid-utf8-continuation-byte)))))
198 (,setter
199 (block setter-block
200 (cond
201 ((< u1 #x80) u1) ; 1 octet
202 ((< u1 #xc0)
203 (handle-error 1 invalid-utf8-starter-byte))
204 (t
205 (setq u2 (consume-octet))
206 (handle-error-if-icb u2 1)
207 (cond
208 ((< u1 #xc2)
209 (handle-error 2 overlong-utf8-sequence))
210 ((< u1 #xe0) ; 2 octets
211 (logior (f-ash (f-logand #x1f u1) 6)
212 (f-logxor u2 #x80)))
213 (t
214 (setq u3 (consume-octet))
215 (handle-error-if-icb u3 2)
216 (cond
217 ((and (= u1 #xe0) (< u2 #xa0))
218 (handle-error 3 overlong-utf8-sequence))
219 ((< u1 #xf0) ; 3 octets
220 (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
221 (f-ash (f-logand u2 #x3f) 6))))
222 (if (<= #xd800 start #xdfc0)
223 (handle-error 3 character-out-of-range)
224 (logior start (f-logand u3 #x3f)))))
225 (t ; 4 octets
226 (setq u4 (consume-octet))
227 (handle-error-if-icb u4 3)
228 (cond
229 ((and (= u1 #xf0) (< u2 #x90))
230 (handle-error 4 overlong-utf8-sequence))
231 ((< u1 #xf8)
232 (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2 #x8f)))
233 (handle-error 4 character-out-of-range)
234 (f-logior (f-ash (f-logand u1 7) 18)
235 (f-ash (f-logxor u2 #x80) 12)
236 (f-ash (f-logxor u3 #x80) 6)
237 (f-logxor u4 #x80))))
238 ;; from here on we'll be getting either
239 ;; invalid continuation bytes or overlong
240 ;; 5-byte or 6-byte sequences.
241 (t
242 (setq u5 (consume-octet))
243 (handle-error-if-icb u5 4)
244 (cond
245 ((and (= u1 #xf8) (< u2 #x88))
246 (handle-error 5 overlong-utf8-sequence))
247 ((< u1 #xfc)
248 (handle-error 5 character-out-of-range))
249 (t
250 (setq u6 (consume-octet))
251 (handle-error-if-icb u6 5)
252 (cond
253 ((and (= u1 #xfc) (< u2 #x84))
254 (handle-error 6 overlong-utf8-sequence))
255 (t
256 (handle-error 6 character-out-of-range)
257 )))))))))))))
258 dest di))
259 finally (return (the fixnum (- di d-start)))))))
260
261 ;;;; UTF-8B
262
263 ;;; The following excerpt from a linux-utf8 message by Markus Kuhn is
264 ;;; the closest thing to a UTF-8B specification:
265 ;;;
266 ;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>
267 ;;;
268 ;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed
269 ;;; UTF-8 sequence
270 ;;;
271 ;;; All the previous options for converting malformed UTF-8 sequences
272 ;;; to UTF-16 destroy information. This can be highly undesirable in
273 ;;; applications such as text file editors, where guaranteed binary
274 ;;; transparency is a desireable feature. (E.g., I frequently edit
275 ;;; executable code or graphic files with the Emacs text editor and I
276 ;;; hate the idea that my editor might automatically make U+FFFD
277 ;;; substitutions at locations that I haven't even edited when I save
278 ;;; the file again.)
279 ;;;
280 ;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx
281 ;;; mailing list the following approach. Instead of using U+FFFD,
282 ;;; simply encode malformed UTF-8 sequences as malformed UTF-16
283 ;;; sequences. Malformed UTF-8 sequences consist excludively of the
284 ;;; bytes 0x80 - 0xff, and each of these bytes can be represented
285 ;;; using a 16-bit value from the UTF-16 low-half surrogate zone
286 ;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from
287 ;;; the above example would be represented in UTF-16 as U+DCC1
288 ;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate
289 ;;; character is also treated like a malformed sequence, then there
290 ;;; is no way that a single high-half surrogate could precede the
291 ;;; encoded malformed sequence and cause a valid UTF-16 sequence to
292 ;;; emerge.
293 ;;;
294 ;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8
295 ;;; round-trip compatibility can be achieved quite easily.
296 ;;;
297 ;;; On an output device, a lonely low-half surrogate character should
298 ;;; be treated just like a character outside the adopted subset of
299 ;;; representable characters, that is for the end user, the display
300 ;;; would look exactly like with semantics B), i.e. one symbol per
301 ;;; byte of a malformed sequence. However in contrast to semantics
302 ;;; B), no information is thrown away, and a cut&paste in an editor
303 ;;; or terminal emulator will be guaranteed to reconstruct the
304 ;;; original byte sequence. This should greatly reduce the incidence
305 ;;; of accidental corruption of binary data by UTF-8 -> UTF-16 ->
306 ;;; UTF-8 conversion round trips."
307
308 (define-character-encoding :utf-8b
309 "An 8-bit, variable-length character encoding in which
310 character code points in the range #x00-#x7f can be encoded in a
311 single octet; characters with larger code values can be encoded
312 in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX
313 code points for each invalid byte."
314 :max-units-per-char 4
315 :literal-char-code-limit #x80
316 :bom-encoding #(#xef #xbb #xbf)
317 :default-replacement nil)
318
319 ;;; TODO: reuse the :UTF-8 octet counter through a simple macro.
320 (define-octet-counter :utf-8b (getter type)
321 `(named-lambda utf-8b-octet-counter (seq start end max)
322 (declare (type ,type seq) (fixnum start end max))
323 (loop with noctets fixnum = 0
324 for i fixnum from start below end
325 for code of-type code-point = (,getter seq i) do
326 (let ((new (+ (cond ((< code #x80) 1)
327 ((< code #x800) 2)
328 ((<= #xdc80 code #xdcff) 1)
329 ((< code #x10000) 3)
330 (t 4))
331 noctets)))
332 (if (and (plusp max) (> new max))
333 (loop-finish)
334 (setq noctets new)))
335 finally (return (values noctets i)))))
336
337 (define-code-point-counter :utf-8b (getter type)
338 `(named-lambda utf-8b-code-point-counter (seq start end max)
339 (declare (type ,type seq) (fixnum start end max))
340 (loop with nchars fixnum = 0
341 with i fixnum = start
342 while (< i end) do
343 ;; wrote this code with LET instead of FOR because CLISP's
344 ;; LOOP doesn't like WHILE clauses before FOR clauses.
345 (let* ((octet (,getter seq i))
346 (noctets (cond ((< octet #x80) 1)
347 ((< octet #xe0) 2)
348 ((< octet #xf0) 3)
349 (t 4))))
350 (declare (type ub8 octet) (fixnum noctets))
351 (cond
352 ((> (+ i noctets) end)
353 ;; If this error is suppressed these last few bytes
354 ;; will be encoded as raw bytes later.
355 (decoding-error (vector octet) :utf-8 seq i
356 nil 'end-of-input-in-character)
357 (return (values (+ nchars (- end i)) end)))
358 (t
359 ;; FIXME: clean this mess up.
360 (let* ((u1 octet)
361 (u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))
362 (u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))
363 (u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))
364 (inc (or (and (> noctets 1)
365 (< u1 #xc2))
366 (and (= noctets 2)
367 (not (logior u2 #x40)))
368 (and (= noctets 3)
369 (not (and (< (f-logxor u2 #x80) #x40)
370 (< (f-logxor u3 #x80) #x40)
371 (or (>= u1 #xe1) (>= u2 #xa0))
372 (or (/= u1 #xed) (< u2 #xa0) (> u2 #xbf)))))
373 (and (= noctets 4)
374 (not
375 (and (< (f-logxor u2 #x80) #x40)
376 (< (f-logxor u3 #x80) #x40)
377 (< (f-logxor u4 #x80) #x40)
378 (or (>= u1 #xf1) (>= u2 #x90))))))))
379 (let ((new-nchars (if inc (+ nchars noctets) (1+ nchars))))
380 (when (and (plusp max) (> new-nchars max))
381 (return (values nchars i)))
382 (incf i noctets)
383 (setq nchars new-nchars))))))
384 finally (progn
385 (assert (= i end))
386 (return (values nchars i))))))
387
388 ;;; TODO: reuse the :UTF-8 encoder with through a simple macro.
389 (define-encoder :utf-8b (getter src-type setter dest-type)
390 `(named-lambda utf-8b-encoder (src start end dest d-start)
391 (declare (type ,src-type src)
392 (type ,dest-type dest)
393 (fixnum start end d-start))
394 (loop with di fixnum = d-start
395 for i fixnum from start below end
396 for code of-type code-point = (,getter src i) do
397 (macrolet ((set-octet (offset value)
398 `(,',setter ,value dest (the fixnum (+ di ,offset)))))
399 (cond
400 ;; 1 octet
401 ((< code #x80)
402 (set-octet 0 code)
403 (incf di))
404 ;; 2 octets
405 ((< code #x800)
406 (set-octet 0 (logior #xc0 (f-ash code -6)))
407 (set-octet 1 (logior #x80 (f-logand code #x3f)))
408 (incf di 2))
409 ;; 1 octet (invalid octet)
410 ((<= #xdc80 code #xdcff)
411 (set-octet 0 (f-logand code #xff))
412 (incf di))
413 ;; 3 octets
414 ((< code #x10000)
415 (set-octet 0 (logior #xe0 (f-ash code -12)))
416 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
417 (set-octet 2 (logior #x80 (f-logand code #x3f)))
418 (incf di 3))
419 ;; 4 octets
420 (t
421 (set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
422 (set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
423 (set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
424 (set-octet 3 (logand #x3f code))
425 (incf di 4))))
426 finally (return (the fixnum (- di d-start))))))
427
428 (define-decoder :utf-8b (getter src-type setter dest-type)
429 `(named-lambda utf-8b-decoder (src start end dest d-start)
430 (declare (type ,src-type src)
431 (type ,dest-type dest)
432 (fixnum start end d-start))
433 (let ((u2 0) (u3 0) (u4 0))
434 (declare (type ub8 u2 u3 u4))
435 (loop for di fixnum from d-start
436 for i fixnum from start below end
437 for u1 of-type ub8 = (,getter src i) do
438 ;; Unlike the UTF-8 version, this version of
439 ;; CONSUME-OCTET needs to check if I is being incremented
440 ;; past END because we might have trailing binary
441 ;; garbage.
442 (macrolet
443 ((consume-octet (n)
444 `(if (= i (1- end))
445 (encode-raw-octets ,n)
446 (,',getter src (incf i))))
447 (encode-raw-octets (n)
448 `(progn
449 ,@(loop for i below n and var in '(u1 u2 u3 u4)
450 collect `(,',setter (logior #xdc00 ,var) dest di)
451 unless (= i (1- n))
452 collect '(incf di))
453 (return-from set-body))))
454 (block set-body
455 (,setter (cond
456 ((< u1 #x80) ; 1 octet
457 u1)
458 ((>= u1 #xc2)
459 (setq u2 (consume-octet 1))
460 (cond
461 ((< u1 #xe0) ; 2 octets
462 (if (< (f-logxor u2 #x80) #x40)
463 (logior (f-ash (f-logand #x1f u1) 6)
464 (f-logxor u2 #x80))
465 (encode-raw-octets 2)))
466 (t
467 (setq u3 (consume-octet 2))
468 (cond
469 ((< u1 #xf0) ; 3 octets
470 (if (and (< (f-logxor u2 #x80) #x40)
471 (< (f-logxor u3 #x80) #x40)
472 (or (>= u1 #xe1) (>= u2 #xa0)))
473 (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
474 (f-ash (f-logand u2 #x3f) 6))))
475 (if (<= #xd800 start #xdfc0)
476 (encode-raw-octets 3)
477 (logior start (f-logand u3 #x3f))))
478 (encode-raw-octets 3)))
479 (t ; 4 octets
480 (setq u4 (consume-octet 3))
481 (if (and (< (f-logxor u2 #x80) #x40)
482 (< (f-logxor u3 #x80) #x40)
483 (< (f-logxor u4 #x80) #x40)
484 (or (>= u1 #xf1) (>= u2 #x90)))
485 (logior
486 (f-logior (f-ash (f-logand u1 7) 18)
487 (f-ash (f-logxor u2 #x80) 12))
488 (f-logior (f-ash (f-logxor u3 #x80) 6)
489 (f-logxor u4 #x80)))
490 (encode-raw-octets 4)))))))
491 (t (encode-raw-octets 1)))
492 dest di)))
493 finally (return (the fixnum (- di d-start)))))))
494
495 ;;;; UTF-16
496
497 ;;; TODO: add a way to pass some info at compile-time telling us that,
498 ;;; for example, the maximum code-point will always be < #x10000 in
499 ;;; which case we could simply return (* 2 (- end start)).
500 (defmacro utf16-octet-counter (getter type)
501 `(named-lambda utf-16-octet-counter (seq start end max)
502 (declare (type ,type seq) (fixnum start end max))
503 (loop with noctets fixnum = 0
504 for i fixnum from start below end
505 for code of-type code-point = (,getter seq i)
506 do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctets))))
507 (if (and (plusp max) (> new max))
508 (loop-finish)
509 (setq noctets new)))
510 finally (return (values noctets i)))))
511
512 (defmacro utf-16-combine-surrogate-pairs (u1 u2)
513 `(the (unsigned-byte 21)
514 (+ #x10000
515 (the (unsigned-byte 20)
516 (logior
517 (the (unsigned-byte 20)
518 (ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))
519 (the (unsigned-byte 10)
520 (- ,u2 #xdc00)))))))
521
522 (defmacro define-utf-16 (name &optional endianness)
523 (check-type endianness (or null (eql :be) (eql :le)))
524 (check-type name keyword)
525 (let ((swap-var (gensym "SWAP"))
526 (code-point-counter-name
527 (format-symbol t '#:~a-code-point-counter (string name)))
528 (encoder-name (format-symbol t '#:~a-encoder (string name)))
529 (decoder-name (format-symbol t '#:~a-decoder (string name))))
530 (labels ((make-bom-check-form (end start getter seq)
531 (if (null endianness)
532 ``((,',swap-var
533 (when (> ,,end ,,start)
534 (case (,,getter ,,seq ,,start 2 :ne)
535 (#.+byte-order-mark-code+ (incf ,,start 2) nil)
536 (#.+swapped-byte-order-mark-code+ (incf ,,start 2) t)
537 (t #+little-endian t)))))
538 '()))
539 (make-getter-form (getter src i)
540 (case endianness
541 (:le ``(,,getter ,,src ,,i 2 :le))
542 (:be ``(,,getter ,,src ,,i 2 :be))
543 (t ``(if ,',swap-var
544 (,,getter ,,src ,,i 2 :re)
545 (,,getter ,,src ,,i 2 :ne)))))
546 (make-setter-form (setter code dest di)
547 (case endianness
548 (:be ``(,,setter ,,code ,,dest ,,di 2 :be))
549 (:le ``(,,setter ,,code ,,dest ,,di 2 :le))
550 (t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))
551 `(progn
552 (define-octet-counter ,name (getter type)
553 `(utf16-octet-counter ,getter ,type))
554 (define-code-point-counter ,name (getter type)
555 `(named-lambda ,',code-point-counter-name (seq start end max)
556 (declare (type ,type seq) (fixnum start end max))
557 (let* ,,(make-bom-check-form ''end ''start 'getter ''seq)
558 (loop with count fixnum = 0
559 with i fixnum = start
560 while (<= i (- end 2)) do
561 (let* ((code ,,(make-getter-form 'getter ''seq ''i))
562 (next-i (+ i (if (or (< code #xd800) (>= code #xdc00))
563 2
564 4))))
565 (declare (type (unsigned-byte 16) code) (fixnum next-i))
566 (cond
567 ((> next-i end)
568 (decoding-error
569 (vector (,getter seq i) (,getter seq (1+ i)))
570 ,',name seq i nil 'end-of-input-in-character)
571 (return (values count i)))
572 (t
573 (setq i next-i
574 count (1+ count))
575 (when (and (plusp max) (= count max))
576 (return (values count i))))))
577 finally (progn
578 (assert (= i end))
579 (return (values count i)))))))
580 (define-encoder ,name (getter src-type setter dest-type)
581 `(named-lambda ,',encoder-name (src start end dest d-start)
582 (declare (type ,src-type src)
583 (type ,dest-type dest)
584 (fixnum start end d-start))
585 (loop with di fixnum = d-start
586 for i fixnum from start below end
587 for code of-type code-point = (,getter src i)
588 for high-bits fixnum = (- code #x10000) do
589 (cond ((< high-bits 0)
590 ,,(make-setter-form 'setter ''code ''dest ''di)
591 (incf di 2))
592 (t
593 ,,(make-setter-form
594 'setter ''(logior #xd800 (f-ash high-bits -10))
595 ''dest ''di)
596 ,,(make-setter-form
597 'setter ''(logior #xdc00 (f-logand high-bits #x3ff))
598 ''dest ''(+ di 2))
599 (incf di 4)))
600 finally (return (the fixnum (- di d-start))))))
601 (define-decoder ,name (getter src-type setter dest-type)
602 `(named-lambda ,',decoder-name (src start end dest d-start)
603 (declare (type ,src-type src)
604 (type ,dest-type dest)
605 (fixnum start end d-start))
606 (let ,,(make-bom-check-form ''end ''start 'getter ''src)
607 (loop with i fixnum = start
608 for di fixnum from d-start
609 until (= i end) do
610 (let ((u1 ,,(make-getter-form 'getter ''src ''i)))
611 (declare (type (unsigned-byte 16) u1))
612 (incf i 2)
613 (,setter (cond
614 ((or (< u1 #xd800) (>= u1 #xe000)) ; 2 octets
615 u1)
616 ((< u1 #xdc00) ; 4 octets
617 (let ((u2 ,,(make-getter-form 'getter ''src ''i)))
618 (declare (type (unsigned-byte 16) u2))
619 (incf i 2)
620 (if (and (>= u2 #xdc00) (< u2 #xe000))
621 (utf-16-combine-surrogate-pairs u1 u2)
622 (decoding-error
623 (vector (,getter src (- i 4))
624 (,getter src (- i 3))
625 (,getter src (- i 2))
626 (,getter src (- i 1)))
627 ,',name src i +repl+))))
628 (t
629 (decoding-error (vector (,getter src (- i 2))
630 (,getter src (- i 1)))
631 ,',name src i +repl+)))
632 dest di))
633 finally (return (the fixnum (- di d-start)))))))
634 ',name))))
635
636 (define-character-encoding :utf-16
637 "A 16-bit, variable-length encoding in which characters with
638 code points less than #x10000 can be encoded in a single 16-bit
639 word and characters with larger codes can be encoded in a pair of
640 16-bit words. The endianness of the encoded data is indicated by
641 the endianness of a byte-order-mark character (#\u+feff)
642 prepended to the data; in the absence of such a character on
643 input, the data is assumed to be in big-endian order. Output is
644 written in native byte-order with a leading byte-order mark."
645 :max-units-per-char 2
646 :code-unit-size 16
647 :native-endianness t ; not necessarily true when decoding
648 :decode-literal-code-unit-limit #xd800
649 :encode-literal-code-unit-limit #x10000
650 :use-bom #+big-endian :utf-16be #+little-endian :utf-16le
651 :bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)
652 :nul-encoding #(0 0)
653 :default-replacement #xfffd
654 :ambiguous #+little-endian t #+big-endian nil)
655
656 (define-utf-16 :utf-16)
657
658 (define-character-encoding :utf-16le
659 "A 16-bit, variable-length encoding in which characters with
660 code points less than #x10000 can be encoded in a single 16-bit
661 word and characters with larger codes can be encoded in a pair of
662 16-bit words. The data is assumed to be in little-endian order. Output is
663 written in little-endian byte-order without a leading byte-order mark."
664 :aliases '(:utf-16/le)
665 :max-units-per-char 2
666 :code-unit-size 16
667 :native-endianness #+little-endian t #+big-endian nil
668 :decode-literal-code-unit-limit #xd800
669 :encode-literal-code-unit-limit #x10000
670 :nul-encoding #(0 0)
671 :default-replacement #xfffd)
672
673 (define-utf-16 :utf-16le :le)
674
675 (define-character-encoding :utf-16be
676 "A 16-bit, variable-length encoding in which characters with
677 code points less than #x10000 can be encoded in a single 16-bit
678 word and characters with larger codes can be encoded in a pair of
679 16-bit words. The data is assumed to be in big-endian order. Output is
680 written in big-endian byte-order without a leading byte-order mark."
681 :aliases '(:utf-16/be)
682 :max-units-per-char 2
683 :code-unit-size 16
684 :native-endianness #+little-endian nil #+big-endian t
685 :decode-literal-code-unit-limit #xd800
686 :encode-literal-code-unit-limit #x10000
687 :nul-encoding #(0 0)
688 :default-replacement #xfffd)
689
690 (define-utf-16 :utf-16be :be)
691
692 (defmacro define-ucs (name bytes &optional endianness (limit #x110000))
693 (check-type name keyword)
694 (check-type bytes (or (eql 2) (eql 4)))
695 (check-type endianness (or null (eql :le) (eql :be)))
696 (let ((swap-var (gensym "SWAP"))
697 (code-point-counter-name
698 (format-symbol t '#:~a-code-point-counter (string name)))
699 (encoder-name
700 (format-symbol t '#:~a-encoder (string name)))
701 (decoder-name
702 (format-symbol t '#:~a-decoder (string name))))
703 (labels ((make-bom-check-form (end start getter src)
704 (if (null endianness)
705 ``(when (not (zerop (- ,,end ,,start)))
706 (case (,,getter ,,src 0 ,',bytes :ne)
707 (#.+byte-order-mark-code+
708 (incf ,,start ,',bytes) nil)
709 (#.+swapped-byte-order-mark-code-32+
710 (incf ,,start ,',bytes) t)
711 (t #+little-endian t)))
712 '()))
713 (make-setter-form (setter code dest di)
714 ``(,,setter ,,code ,,dest ,,di ,',bytes
715 ,',(or endianness :ne)))
716 (make-getter-form (getter src i)
717 (if (null endianness)
718 ``(if ,',swap-var
719 (,,getter ,,src ,,i ,',bytes :re)
720 (,,getter ,,src ,,i ,',bytes :ne))
721 ``(,,getter ,,src ,,i ,',bytes ,',endianness))))
722 `(progn
723 (define-code-point-counter ,name (getter type)
724 `(named-lambda ,',code-point-counter-name (seq start end max)
725 (declare (type ,type seq) (fixnum start end max))
726 ;; check for bom
727 ,,(make-bom-check-form ''end ''start 'getter ''seq)
728 (multiple-value-bind (count rem)
729 (floor (- end start) ,',bytes)
730 (cond
731 ((and (plusp max) (> count max))
732 (values max (the fixnum (+ start (* ,',bytes max)))))
733 (t
734 ;; check for incomplete last character
735 (unless (zerop rem)
736 (let ((vector (make-array ,',bytes :fill-pointer 0)))
737 (dotimes (i rem)
738 (vector-push (,getter seq (+ i (- end rem))) vector))
739 (decoding-error vector ,',name seq (the fixnum (- end rem)) nil
740 'end-of-input-in-character)
741 (decf end rem)))
742 (values count end))))))
743 (define-encoder ,name (getter src-type setter dest-type)
744 `(named-lambda ,',encoder-name (src start end dest d-start)
745 (declare (type ,src-type src)
746 (type ,dest-type dest)
747 (fixnum start end d-start))
748 (loop for i fixnum from start below end
749 and di fixnum from d-start by ,',bytes
750 for code of-type code-point = (,getter src i)
751 do (if (>= code ,',limit)
752 (encoding-error code ,',name src i +repl+)
753 ,,(make-setter-form 'setter ''code ''dest ''di))
754 finally (return (the fixnum (- di d-start))))))
755 (define-decoder ,name (getter src-type setter dest-type)
756 `(named-lambda ,',decoder-name (src start end dest d-start)
757 (declare (type ,src-type src)
758 (type ,dest-type dest)
759 (fixnum start end d-start))
760 (let ((,',swap-var ,,(make-bom-check-form ''end ''start 'getter ''src)))
761 (declare (ignorable ,',swap-var))
762 (loop for i fixnum from start below end by ,',bytes
763 and di from d-start
764 do (,setter (let ((unit ,,(make-getter-form 'getter ''src ''i)))
765 (if (>= unit ,',limit)
766 (decoding-error
767 (vector (,getter src i)
768 (,getter src (+ i 1))
769 ,@,(if (= bytes 4)
770 ``((,getter src (+ i 2))
771 (,getter src (+ i 3)))))
772 ,',name src i +repl+
773 'character-out-of-range)
774 unit))
775 dest di)
776 finally (return (the fixnum (- di d-start)))))))
777 ',name))))
778
779 ;;;; UTF-32
780
781 (define-character-encoding :utf-32
782 "A 32-bit, fixed-length encoding in which all Unicode
783 characters can be encoded in a single 32-bit word. The
784 endianness of the encoded data is indicated by the endianness of
785 a byte-order-mark character (#\u+feff) prepended to the data; in
786 the absence of such a character on input, input data is assumed
787 to be in big-endian order. Output is written in native byte
788 order with a leading byte-order mark."
789 :aliases '(:ucs-4)
790 :max-units-per-char 1
791 :code-unit-size 32
792 :native-endianness t ; not necessarily true when decoding
793 :literal-char-code-limit #x110000
794 :use-bom #+little-endian :utf-32le #+big-endian :utf-32be
795 :bom-encoding
796 #+big-endian #(#x00 #x00 #xfe #xff)
797 #+little-endian #(#xff #xfe #x00 #x00)
798 :nul-encoding #(0 0 0 0)
799 :ambiguous #+little-endian t #+big-endian nil)
800
801 (define-ucs :utf-32 4)
802
803 (define-character-encoding :utf-32le
804 "A 32-bit, fixed-length encoding in which all Unicode
805 characters can be encoded in a single 32-bit word. Input data is assumed
806 to be in little-endian order. Output is also written in little-endian byte
807 order without a leading byte-order mark."
808 :max-units-per-char 1
809 :code-unit-size 32
810 :aliases '(:utf-32/le :ucs-4le :ucs-4/le)
811 :native-endianness #+little-endian t #+big-endian nil
812 :literal-char-code-limit #x110000
813 :nul-encoding #(0 0 0 0))
814
815 (define-ucs :utf-32le 4 :le)
816
817 (define-character-encoding :utf-32be
818 "A 32-bit, fixed-length encoding in which all Unicode
819 characters can be encoded in a single 32-bit word. Input data is assumed
820 to be in big-endian order. Output is also written in big-endian byte
821 order without a leading byte-order mark."
822 :max-units-per-char 1
823 :code-unit-size 32
824 :aliases '(:utf-32/be :ucs-4be :ucs-4/be)
825 :native-endianness #+little-endian nil #+big-endian t
826 :literal-char-code-limit #x110000
827 :nul-encoding #(0 0 0 0))
828
829 (define-ucs :utf-32be 4 :be)
830
831 ;; UCS-2
832
833 (define-character-encoding :ucs-2
834 "A 16-bit, fixed-length encoding in which all Unicode
835 characters can be encoded in a single 16-bit word. The
836 endianness of the encoded data is indicated by the endianness of
837 a byte-order-mark character (#\u+feff) prepended to the data; in
838 the absence of such a character on input, input data is assumed
839 to be in big-endian order. Output is written in native byte
840 order with a leading byte-order mark."
841 :aliases '(:ucs-2)
842 :max-units-per-char 1
843 :code-unit-size 16
844 :native-endianness t ; not necessarily true when decoding
845 :literal-char-code-limit #x10000
846 :use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be
847 :bom-encoding
848 #+big-endian #(#xfe #xff)
849 #+little-endian #(#xff #xfe)
850 :nul-encoding #(0 0)
851 :ambiguous #+little-endian t #+big-endian nil)
852
853 (define-ucs :ucs-2 2 nil #x10000)
854
855 (define-character-encoding :ucs-2le
856 "A 16-bit, fixed-length encoding in which all Unicode
857 characters can be encoded in a single 16-bit word. Input data is assumed
858 to be in little-endian order. Output is also written in little-endian byte
859 order without a leading byte-order mark."
860 :max-units-per-char 1
861 :code-unit-size 16
862 :aliases '(:ucs-2/le)
863 :native-endianness #+little-endian t #+big-endian nil
864 :literal-char-code-limit #x10000
865 :nul-encoding #(0 0))
866
867 (define-ucs :ucs-2le 2 :le #x10000)
868
869 (define-character-encoding :ucs-2be
870 "A 16-bit, fixed-length encoding in which all Unicode
871 characters can be encoded in a single 16-bit word. Input data is assumed
872 to be in big-endian order. Output is also written in big-endian byte
873 order without a leading byte-order mark."
874 :max-units-per-char 1
875 :code-unit-size 16
876 :aliases '(:ucs-2/be)
877 :native-endianness #+little-endian nil #+big-endian t
878 :literal-char-code-limit #x10000
879 :nul-encoding #(0 0))
880
881 (define-ucs :ucs-2be 2 :be #x10000)