encodings.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
(HTM) git clone git://bitreich.org/clic/ git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/clic/
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) README
(DIR) LICENSE
---
encodings.lisp (22651B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; encodings.lisp --- Character encodings and mappings.
4 ;;;
5 ;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 (in-package #:babel-encodings)
28
29 ;;;; Character Encodings
30
31 (defclass character-encoding ()
32 ((name :initarg :name :reader enc-name
33 :initform (error "Must specify a NAME for this character encoding."))
34 ;; Most of these documentation strings are taken from OpenMCL.
35 (documentation
36 :initarg :documentation :reader enc-documentation :initform nil)
37 ;; A non-exhaustive list of aliases for the encoding.
38 (aliases :initarg :aliases :initform nil :reader enc-aliases)
39 ;; Specified in bits. Usually 8, 16 or 32.
40 (code-unit-size
41 :initarg :code-unit-size :reader enc-code-unit-size :initform 8)
42 (max-units-per-char
43 :initarg :max-units-per-char :reader enc-max-units-per-char :initform 1)
44 ;; If NIL, it is necessary to swap 16- and 32-bit units.
45 (native-endianness
46 :initarg :native-endianness :reader enc-native-endianness :initform t)
47 ;; Code units less than this value map to themselves on input.
48 (decode-literal-code-unit-limit
49 :initarg :decode-literal-code-unit-limit :initform 0
50 :reader enc-decode-literal-code-unit-limit)
51 ;; Code points less than this value map to themselves on output.
52 (encode-literal-code-unit-limit
53 :initarg :encode-literal-code-unit-limit :initform 0
54 :reader enc-encode-literal-code-unit-limit)
55 ;; Defines whether it is necessary to prepend a byte-order-mark to
56 ;; determine the endianness.
57 (use-bom :initarg :use-bom :initform nil :reader enc-use-bom)
58 ;; How the byte-order-mark should be encoded, specified as a
59 ;; sequence of octets. NIL if it cannot be encoded.
60 (bom-encoding
61 :initarg :bom-encoding :reader enc-bom-encoding :initform nil)
62 ;; How should NUL be encoded, specified as sequence of octets.
63 (nul-encoding
64 :initarg :nul-encoding :reader enc-nul-encoding :initform #(0))
65 ;; Preferred replacement character code point.
66 (default-replacement
67 :initarg :default-replacement :reader enc-default-replacement
68 :initform #x1a)
69 ;; Does VALID-STRING => OCTETS => STRING2 guarantee a valid
70 ;; STRING2? UTF-{16,32} on little-endian plaforms don't because
71 ;; they assume different endianness on each direction.
72 (ambiguous
73 :initarg :ambiguous :reader ambiguous-encoding-p :initform nil)))
74
75 ;;; I'm too lazy to write all the identical limits twice.
76 (defmethod initialize-instance :after ((enc character-encoding)
77 &key literal-char-code-limit)
78 (when literal-char-code-limit
79 (setf (slot-value enc 'encode-literal-code-unit-limit)
80 literal-char-code-limit)
81 (setf (slot-value enc 'decode-literal-code-unit-limit)
82 literal-char-code-limit)))
83
84 #-(and)
85 (defmethod describe-object ((enc character-encoding) s)
86 "Prints out the name, aliases and documentation slots of a
87 character encoding object."
88 (with-slots (name aliases documentation) enc
89 (format s "~&~S" name)
90 (when aliases
91 (format s " [Aliases:~{ ~S~}]" aliases))
92 (format s "~&~A~%~%" documentation))
93 (call-next-method))
94
95 (defvar *supported-character-encodings* nil)
96
97 (defun list-character-encodings ()
98 "List of keyword symbols denoting supported character
99 encodings. This list does not include aliases."
100 *supported-character-encodings*)
101
102 (defvar *character-encodings* (make-hash-table :test 'eq))
103
104 (defvar *default-character-encoding* :utf-8
105 "Special variable used to determine the default character
106 encoding.")
107
108 (defun get-character-encoding (name)
109 "Lookups the character encoding denoted by the keyword symbol
110 NAME. Signals an error if one is not found. If NAME is already
111 a CHARACTER-ENCONDING object, it is returned unmodified."
112 (when (typep name 'character-encoding)
113 (return-from get-character-encoding name))
114 (when (eq name :default)
115 (setq name *default-character-encoding*))
116 (or (gethash name *character-encodings*)
117 (error "Unknown character encoding: ~S" name)))
118
119 (defmethod ambiguous-encoding-p ((encoding symbol))
120 (ambiguous-encoding-p (get-character-encoding encoding)))
121
122 (defun notice-character-encoding (enc)
123 (pushnew (enc-name enc) *supported-character-encodings*)
124 (dolist (kw (cons (enc-name enc) (enc-aliases enc)))
125 (setf (gethash kw *character-encodings*) enc))
126 (enc-name enc))
127
128 (defmacro define-character-encoding (name docstring &body options)
129 `(notice-character-encoding
130 (make-instance 'character-encoding :name ,name ,@options
131 :documentation ,docstring)))
132
133 ;;;; Mappings
134
135 ;;; TODO: describe what mappings are
136
137 (defun make-fixed-width-counter (getter type &optional (unit-size-in-bits 8))
138 (declare (ignore getter type))
139 (check-type unit-size-in-bits positive-fixnum)
140 (let ((unit-size-in-bytes (/ unit-size-in-bits 8)))
141 `(named-lambda fixed-width-counter (seq start end max)
142 (declare (ignore seq) (fixnum start end max))
143 ;; XXX: the result can be bigger than a fixnum when (> unit-size
144 ;; 1) and we don't want that to happen. Possible solution: signal
145 ;; a warning (hmm, make that an actual error) and truncate.
146 (if (plusp max)
147 (let ((count (the fixnum (min (floor max ,unit-size-in-bytes)
148 (the fixnum (- end start))))))
149 (values (the fixnum (* count ,unit-size-in-bytes))
150 (the fixnum (+ start count))))
151 (values (the fixnum (* (the fixnum (- end start))
152 ,unit-size-in-bytes))
153 (the fixnum end))))))
154
155 ;;; Useful to develop new encodings incrementally starting with octet
156 ;;; and code-unit counters.
157 (defun make-dummy-coder (sg st ds dt)
158 (declare (ignore sg st ds dt))
159 `(named-lambda dummy-coder (src s e dest i)
160 (declare (ignore src s e dest i))
161 (error "this encoder/decoder hasn't been implemented yet")))
162
163 ;;; TODO: document here
164 ;;;
165 ;;; ENCODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
166 ;;; DECODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
167 ;;;
168 ;;; OCTET-COUNTER -- (lambda (getter type) ...)
169 ;;; CODE-POINT-COUNTER -- (lambda (getter type) ...)
170 (defclass abstract-mapping ()
171 ((encoder-factory :accessor encoder-factory :initform 'make-dummy-coder)
172 (decoder-factory :accessor decoder-factory :initform 'make-dummy-coder)
173 (octet-counter-factory :accessor octet-counter-factory
174 :initform 'make-fixed-width-counter)
175 (code-point-counter-factory :accessor code-point-counter-factory
176 :initform 'make-fixed-width-counter)))
177
178 ;;; TODO: document these
179 ;;;
180 ;;; ENCODER -- (lambda (src start end dest d-start) ...)
181 ;;; DECODER -- (lambda (src start end dest d-start) ...)
182 ;;;
183 ;;; OCTET-COUNTER -- (lambda (seq start end max-octets) ...)
184 ;;; CODE-POINT-COUNTER -- (lambda (seq start end max-chars) ...)
185 ;;; => N-CHARS NEW-END
186 ;;; (important: describe NEW-END)
187 (defclass concrete-mapping ()
188 ((encoder :accessor encoder)
189 (decoder :accessor decoder)
190 (octet-counter :accessor octet-counter)
191 (code-point-counter :accessor code-point-counter)))
192
193 (defparameter *abstract-mappings* (make-hash-table :test 'eq))
194
195 (defun get-abstract-mapping (encoding)
196 (gethash encoding *abstract-mappings*))
197
198 (defun (setf get-abstract-mapping) (value encoding)
199 (setf (gethash encoding *abstract-mappings*) value))
200
201 (defun %register-mapping-part (encoding slot-name fn)
202 (let ((mapping (get-abstract-mapping encoding)))
203 (unless mapping
204 (setq mapping (make-instance 'abstract-mapping))
205 (setf (get-abstract-mapping encoding) mapping))
206 (setf (slot-value mapping slot-name) fn)))
207
208 ;;; See enc-*.lisp for example usages of these 4 macros.
209
210 (defmacro define-encoder (encoding (sa st da dt) &body body)
211 `(%register-mapping-part ,encoding 'encoder-factory
212 (named-lambda encoder (,sa ,st ,da ,dt)
213 ,@body)))
214
215 (defmacro define-decoder (encoding (sa st da dt) &body body)
216 `(%register-mapping-part ,encoding 'decoder-factory
217 (named-lambda decoder (,sa ,st ,da ,dt)
218 ,@body)))
219
220 (defmacro define-octet-counter (encoding (acc type) &body body)
221 `(%register-mapping-part ,encoding 'octet-counter-factory
222 (named-lambda octet-counter-factory (,acc ,type)
223 ,@body)))
224
225 (defmacro define-code-point-counter (encoding (acc type) &body body)
226 `(%register-mapping-part ,encoding 'code-point-counter-factory
227 (named-lambda code-point-counter (,acc ,type)
228 ,@body)))
229
230 (defun instantiate-encoder (encoding am octet-seq-getter octet-seq-type
231 code-point-seq-setter code-point-seq-type)
232 (declare (ignore encoding))
233 (funcall (encoder-factory am)
234 octet-seq-getter
235 octet-seq-type
236 code-point-seq-setter
237 code-point-seq-type))
238
239 (defun instantiate-decoder (encoding am octet-seq-getter octet-seq-type
240 code-point-seq-setter code-point-seq-type)
241 (declare (ignore encoding))
242 (funcall (decoder-factory am)
243 octet-seq-getter
244 octet-seq-type
245 code-point-seq-setter
246 code-point-seq-type))
247
248 (defun instantiate-code-point-counter (encoding am octet-seq-getter
249 octet-seq-type)
250 (declare (ignore encoding))
251 (funcall (code-point-counter-factory am)
252 octet-seq-getter
253 octet-seq-type))
254
255 (defun instantiate-octet-counter (encoding am code-point-seq-getter
256 code-point-seq-type)
257 (if (= 1 (enc-max-units-per-char encoding))
258 (make-fixed-width-counter code-point-seq-getter code-point-seq-type
259 (enc-code-unit-size encoding))
260 (funcall (octet-counter-factory am)
261 code-point-seq-getter
262 code-point-seq-type)))
263
264 ;;; Expands into code generated by the available abstract mappings
265 ;;; that will be compiled into concrete mappings. This is used in
266 ;;; e.g. strings.lisp to define mappings between strings and
267 ;;; (unsigned-byte 8) vectors.
268 ;;;
269 ;;; For each encoding funcall the abstract mappings at macro-expansion
270 ;;; time with the src/dest accessors and types to generate the
271 ;;; appropriate code for the concrete mappings. These functions are
272 ;;; then saved in their respective slots of the CONCRETE-MAPPING
273 ;;; object.
274 (defmacro instantiate-concrete-mappings
275 (&key (encodings (hash-table-keys *abstract-mappings*))
276 (optimize '((speed 3) (debug 0) (compilation-speed 0)))
277 octet-seq-getter octet-seq-setter octet-seq-type
278 code-point-seq-getter code-point-seq-setter code-point-seq-type
279 (instantiate-decoders t))
280 `(let ((ht (make-hash-table :test 'eq)))
281 (declare (optimize ,@optimize)
282 #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note))
283 (flet ((notice-mapping (encoding-name cm)
284 (let* ((encoding (get-character-encoding encoding-name))
285 (aliases (enc-aliases encoding)))
286 (dolist (kw (cons (enc-name encoding) aliases))
287 (setf (gethash kw ht) cm)))))
288 ,@(loop for encoding-name in encodings
289 for encoding = (get-character-encoding encoding-name)
290 for am = (gethash encoding-name *abstract-mappings*)
291 collect
292 `(let ((cm (make-instance 'concrete-mapping)))
293 (setf (encoder cm)
294 ,(instantiate-encoder encoding am
295 code-point-seq-getter
296 code-point-seq-type
297 octet-seq-setter
298 octet-seq-type))
299 ,(when instantiate-decoders
300 `(progn
301 (setf (decoder cm)
302 ,(instantiate-decoder encoding am
303 octet-seq-getter
304 octet-seq-type
305 code-point-seq-setter
306 code-point-seq-type))
307 (setf (code-point-counter cm)
308 ,(instantiate-code-point-counter
309 encoding am octet-seq-getter octet-seq-type))))
310 (setf (octet-counter cm)
311 ,(instantiate-octet-counter encoding am
312 code-point-seq-getter
313 code-point-seq-type))
314 (notice-mapping ,encoding-name cm))))
315 ht))
316
317 ;;; debugging stuff
318
319 #-(and)
320 (defun pprint-instantiate-concrete-mappings
321 (&key (encodings (hash-table-keys *abstract-mappings*))
322 (optimize '((debug 3) (safety 3)))
323 (octet-seq-setter 'ub-set) (octet-seq-getter 'ub-get)
324 (octet-seq-type '(simple-array (unsigned-byte 8) (*)))
325 (code-point-seq-setter 'string-set)
326 (code-point-seq-getter 'string-get)
327 (code-point-seq-type 'simple-unicode-string))
328 (let ((encodings (ensure-list encodings))
329 (*package* (find-package :babel-encodings))
330 (*print-case* :downcase))
331 (pprint
332 (macroexpand
333 `(instantiate-concrete-mappings
334 :encodings ,encodings
335 :optimize ,optimize
336 :octet-seq-getter ,octet-seq-getter
337 :octet-seq-setter ,octet-seq-setter
338 :octet-seq-type ,octet-seq-type
339 :code-point-seq-getter ,code-point-seq-getter
340 :code-point-seq-setter ,code-point-seq-setter
341 :code-point-seq-type ,code-point-seq-type))))
342 (values))
343
344 ;;;; Utilities used in enc-*.lisp
345
346 (defconstant +default-substitution-code-point+ #x1a
347 "Default ASCII substitution character code point used in case of an encoding/decoding error.")
348
349 ;;; We're converting between objects of the (UNSIGNED-BYTE 8) and
350 ;;; (MOD #x110000) types which are aliased here to UB8 and CODE-POINT
351 ;;; for convenience.
352 (deftype ub8 () '(unsigned-byte 8))
353 (deftype code-point () '(mod #x110000))
354
355 ;;; Utility macro around DEFINE-ENCODER that takes care of most of the
356 ;;; work need to deal with an 8-bit, fixed-width character encoding.
357 ;;;
358 ;;; BODY will be inside a loop and its return value will placed in the
359 ;;; destination buffer. BODY will be surounded by lexical BLOCK which
360 ;;; will have the ENCODING's name, usually a keyword. It handles all
361 ;;; sorts of type declarations.
362 ;;;
363 ;;; See enc-ascii.lisp for a simple usage example.
364 (defmacro define-unibyte-encoder (encoding (code) &body body)
365 (with-unique-names (s-getter s-type d-setter d-type
366 src start end dest d-start i di)
367 `(define-encoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
368 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
369 (,',src ,',start ,',end ,',dest ,',d-start)
370 (declare (type ,,s-type ,',src)
371 (type ,,d-type ,',dest)
372 (fixnum ,',start ,',end ,',d-start))
373 (loop for ,',i fixnum from ,',start below ,',end
374 and ,',di fixnum from ,',d-start do
375 (,,d-setter
376 (macrolet
377 ;; this should probably be a function...
378 ((handle-error (&optional (c ''character-encoding-error))
379 `(encoding-error
380 ,',',code ,',',encoding ,',',src ,',',i
381 +default-substitution-code-point+ ,c)))
382 (let ((,',code (,,s-getter ,',src ,',i)))
383 (declare (type code-point ,',code))
384 (block ,',encoding ,@',body)))
385 ,',dest ,',di)
386 finally (return (the fixnum (- ,',di ,',d-start))))))))
387
388 ;;; The decoder version of the above macro.
389 (defmacro define-unibyte-decoder (encoding (octet) &body body)
390 (with-unique-names (s-getter s-type d-setter d-type
391 src start end dest d-start i di)
392 `(define-decoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
393 `(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
394 (,',src ,',start ,',end ,',dest ,',d-start)
395 (declare (type ,,s-type ,',src)
396 (type ,,d-type ,',dest)
397 (fixnum ,',start ,',end ,',d-start))
398 (loop for ,',i fixnum from ,',start below ,',end
399 and ,',di fixnum from ,',d-start do
400 (,,d-setter
401 (macrolet
402 ;; this should probably be a function...
403 ((handle-error (&optional (c ''character-decoding-error))
404 `(decoding-error
405 (vector ,',',octet) ,',',encoding ,',',src ,',',i
406 +default-substitution-code-point+ ,c)))
407 (let ((,',octet (,,s-getter ,',src ,',i)))
408 (declare (type ub8 ,',octet))
409 (block ,',encoding ,@',body)))
410 ,',dest ,',di)
411 finally (return (the fixnum (- ,',di ,',d-start))))))))
412
413 ;;;; Error Conditions
414 ;;;
415 ;;; For now, we don't define any actual restarts. The only mechanism
416 ;;; for "restarting" a coding error is the
417 ;;; *SUPPRESS-CHARACTER-CODING-ERRORS* special variable which, when
418 ;;; bound to T (the default), suppresses any error and uses a default
419 ;;; replacement character instead.
420 ;;;
421 ;;; If it turns out that other more options are necessary, possible
422 ;;; alternative approaches include:
423 ;;;
424 ;;; a) use a *REPLACEMENT-CHARACTER* special variable that lets us
425 ;;; pick our own replacement character. The encoder must do
426 ;;; additional work to check if this is character is encodable.
427 ;;;
428 ;;; b) offer a restart to pick a replacement character. Same
429 ;;; problem as above.
430 ;;;
431 ;;; Both approaches pose encoding problems when dealing with a
432 ;;; variable-width encodings because different replacement characters
433 ;;; will need different numbers of octets. This is not a problem for
434 ;;; UTF but will be a problem for the CJK charsets. Approach (a) is
435 ;;; nevertheless easier since the replacement character is known in
436 ;;; advance and therefore the octet-counter can account for it.
437 ;;;
438 ;;; For more complex restarts like SBCL's -- that'll let you specify
439 ;;; _several_ replacement characters for a single character error --
440 ;;; will probably need extra support code outside the encoder/decoder
441 ;;; (i.e. in the string-to-octets function, for example) since the
442 ;;; encoders/decoders deal with pre-allocated fixed-length buffers.
443 ;;;
444 ;;; SBCL has ASCII-specific (MALFORMED-ASCII) and UTF8-specific
445 ;;; errors. Why? Do we want to add some of those too?
446
447 ;;; FIXME: We used to deal with this with an extra ERRORP argument for
448 ;;; encoders, decoders, etc... Still undecided on the best way to do
449 ;;; it. We could also use a simple restart instead of this...
450 ;;;
451 ;;; In any case, this is not for the users to bind and it's not
452 ;;; exported from the BABEL package.
453 (defvar *suppress-character-coding-errors* nil
454 "If non-NIL, encoding or decoding errors are suppressed and the
455 the current character encoding's default replacement character is
456 used.")
457
458 ;;; All of Babel's error conditions are subtypes of
459 ;;; CHARACTER-CODING-ERROR. This error hierarchy is based on SBCL's.
460 (define-condition character-coding-error (error)
461 ((buffer :initarg :buffer :reader character-coding-error-buffer)
462 (position :initarg :position :reader character-coding-error-position)
463 (encoding :initarg :encoding :reader character-coding-error-encoding)))
464
465 (define-condition character-encoding-error (character-coding-error)
466 ((code :initarg :code :reader character-encoding-error-code))
467 (:report (lambda (c s)
468 (format s "Unable to encode character code point ~A as ~S."
469 (character-encoding-error-code c)
470 (character-coding-error-encoding c)))))
471
472 (declaim (inline encoding-error))
473 (defun encoding-error (code enc buf pos &optional
474 (sub +default-substitution-code-point+)
475 (e 'character-encoding-error))
476 (unless *suppress-character-coding-errors*
477 (error e :encoding enc :buffer buf :position pos :code code))
478 sub)
479
480 (define-condition character-decoding-error (character-coding-error)
481 ((octets :initarg :octets :reader character-decoding-error-octets))
482 (:report (lambda (c s)
483 (format s "Illegal ~S character starting at position ~D."
484 (character-coding-error-encoding c)
485 (character-coding-error-position c)))))
486
487 (define-condition end-of-input-in-character (character-decoding-error)
488 ()
489 (:documentation "Signalled by DECODERs or CODE-POINT-COUNTERs
490 of variable-width character encodings."))
491
492 (define-condition character-out-of-range (character-decoding-error)
493 ()
494 (:documentation
495 "Signalled when the character being decoded is out of range."))
496
497 (declaim (inline decoding-error))
498 (defun decoding-error (octets enc buf pos &optional
499 (sub +default-substitution-code-point+)
500 (e 'character-decoding-error))
501 (unless *suppress-character-coding-errors*
502 (error e :octets octets :encoding enc :buffer buf :position pos))
503 sub)