tenc-gbk.lisp - clic - Clic is an command line interactive client for gopher written in Common LISP
(HTM) git clone git://bitreich.org/clic/ git://hg6vgqziawt5s4dj.onion/clic/
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) LICENSE
---
tenc-gbk.lisp (8297B)
---
1 ;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
2 ;;;
3 ;;; enc-gbk.lisp --- GBK encodings.
4 ;;;
5 ;;; Copyright (C) 2011, Li Wenpeng <levin108@gmail.com>
6 ;;;
7 ;;; Permission is hereby granted, free of charge, to any person
8 ;;; obtaining a copy of this software and associated documentation
9 ;;; files (the "Software"), to deal in the Software without
10 ;;; restriction, including without limitation the rights to use, copy,
11 ;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12 ;;; of the Software, and to permit persons to whom the Software is
13 ;;; furnished to do so, subject to the following conditions:
14 ;;;
15 ;;; The above copyright notice and this permission notice shall be
16 ;;; included in all copies or substantial portions of the Software.
17 ;;;
18 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 ;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22 ;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23 ;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 ;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 ;;; DEALINGS IN THE SOFTWARE.
26
27 (in-package #:babel-encodings)
28
29 ;; populated in gbk-map.lisp
30 (defvar *gbk-unicode-mapping*)
31
32 (defconstant +gbk2-offset+ 0)
33 (defconstant +gbk3-offset+ 6763)
34 (defconstant +gbk4-offset+ (+ 6763 6080))
35 (defconstant +gbk1-offset+ 20902)
36 (defconstant +gbk5-offset+ (+ 20902 846))
37
38 (define-character-encoding :gbk
39 "GBK is an extension of the GB2312 character set for simplified
40 Chinese characters, used in the People's Republic of China."
41 :max-units-per-char 4
42 :literal-char-code-limit #x80)
43
44 (define-condition invalid-gbk-byte (character-decoding-error)
45 ()
46 (:documentation "Signalled when an invalid GBK byte is found."))
47
48 (define-condition invalid-gbk-character (character-encoding-error)
49 ()
50 (:documentation "Signalled when an invalid GBK character is found."))
51
52 (define-octet-counter :gbk (getter type)
53 `(lambda (seq start end max)
54 (declare (type ,type seq) (fixnum start end max))
55 (let ((noctets 0))
56 (loop for i from start below end
57 for u1 of-type code-point = (,getter seq i)
58 do (cond ((< u1 #x80) (incf noctets))
59 (t (incf noctets 2)))
60 (when (and (plusp max) (= noctets max))
61 (return (values noctets i)))
62 finally (return (values noctets i))))))
63
64 (define-code-point-counter :gbk (getter type)
65 `(lambda (seq start end max)
66 (declare (type ,type seq))
67 (let (u1 (noctets 0))
68 (loop with i = start
69 while (< i end)
70 do (setf u1 (,getter seq i))
71 (cond
72 ((eq 0 (logand u1 #x80)) (incf i))
73 (t (incf i 2)))
74 (incf noctets)
75 (when (and (plusp max) (= noctets max))
76 (return (values noctets i)))
77 finally (return (values noctets i))))))
78
79 (define-encoder :gbk (getter src-type setter dest-type)
80 `(lambda (src start end dest d-start)
81 (declare (type ,src-type src)
82 (type ,dest-type dest)
83 (fixnum start end d-start))
84 (macrolet
85 ((do-encoding (index)
86 `(let ((u1 0) (u2 0))
87 (cond
88 ((<= +gbk2-offset+ ,index (- +gbk3-offset+ 1)) ; gbk/2
89 (setf u1 (+ #xB0 (truncate (/ ,index 94))))
90 (setf u2 (+ #xA1 (mod ,index 94))))
91 ((<= +gbk3-offset+ ,index (- +gbk4-offset+ 1)) ; gbk/3
92 (setf index (- ,index +gbk3-offset+))
93 (setf u1 (+ #x81 (truncate (/ ,index 190))))
94 (setf u2 (+ #x40 (mod ,index 190)))
95 (if (>= u2 #x7F) (incf u2)))
96 ((<= +gbk4-offset+ ,index (- +gbk1-offset+ 1)) ; gbk/4
97 (setf index (- ,index +gbk4-offset+))
98 (setf u1 (+ #xAA (truncate (/ ,index 96))))
99 (setf u2 (+ #x40 (mod ,index 96)))
100 (if (>= u2 #x7F) (incf u2)))
101 ((<= +gbk1-offset+ ,index (- +gbk5-offset+ 1)) ; gbk/1
102 (setf index (- ,index +gbk1-offset+))
103 (setf u1 (+ #xA1 (truncate (/ ,index 94))))
104 (setf u2 (+ #xA1 (mod ,index 94))))
105 ((<= +gbk5-offset+ ,index (length *gbk-unicode-mapping*)) ; gbk/5
106 (setf index (- ,index +gbk5-offset+))
107 (setf u1 (+ #xA8 (truncate (/ ,index 96))))
108 (setf u2 (+ #x40 (mod ,index 96)))
109 (if (>= u2 #x7F) (incf u2))))
110 (values u1 u2))))
111 (let ((c 0) index (noctets 0))
112 (loop for i from start below end
113 for code of-type code-point = (,getter src i)
114 do (macrolet
115 ((handle-error (&optional (c 'character-encoding-error))
116 `(encoding-error code :gbk src i +repl+ ',c)))
117 (setf c (code-char code))
118 (cond
119 ((< code #x80) ; ascii
120 (,setter code dest noctets)
121 (incf noctets))
122 (t ; gbk
123 (setf index
124 (position c *gbk-unicode-mapping*))
125
126 (if (not index)
127 (handle-error invalid-gbk-character))
128 (multiple-value-bind (uh ul) (do-encoding index)
129 (,setter uh dest noctets)
130 (,setter ul dest (+ 1 noctets))
131 (incf noctets 2)))))
132 finally (return (the fixnum (- noctets d-start))))))))
133
134 (define-decoder :gbk (getter src-type setter dest-type)
135 `(lambda (src start end dest d-start)
136 (declare (type ,src-type src)
137 (type ,dest-type dest))
138 (let ((u1 0) (u2 0) (index 0) (tmp 0) (noctets 0))
139 (loop with i = start
140 while (< i end)
141 do (macrolet
142 ((handle-error (&optional (c 'character-decoding-error))
143 `(decoding-error #(u1 u2) :gbk src i +repl+ ',c)))
144 (setf u1 (,getter src i))
145 (incf i)
146 (cond
147 ((eq 0 (logand u1 #x80))
148 (,setter u1 dest noctets))
149 (t
150 (setf u2 (,getter src i))
151 (incf i)
152 (setf index
153 (block setter-block
154 (cond
155 ((and (<= #xB0 u1 #xF7) (<= #xA1 u2 #xFE))
156 (+ +gbk2-offset+ (+ (* 94 (- u1 #xB0)) (- u2 #xA1))))
157
158 ((and (<= #x81 u1 #xA0) (<= #x40 u2 #xFE))
159 (cond ((> u2 #x7F) (setf tmp 1))
160 (t (setf tmp 0)))
161 (+ +gbk3-offset+ (* 190 (- u1 #x81)) (- u2 #x40 tmp)))
162
163 ((and (<= #xAA u1 #xFE) (<= #x40 #xA0))
164 (cond ((> u2 #x7F) (setf tmp 1))
165 (t (setf tmp 0)))
166 (+ +gbk4-offset+ (* 96 (- u1 #xAA)) (- u2 #x40 tmp)))
167
168 ((and (<= #xA1 u1 #xA9) (<= #xA1 u2 #xFE))
169 (+ +gbk1-offset+ (* 94 (- u1 #xA1)) (- u2 #xA1)))
170
171 ((and (<= #xA8 u1 #xA9) (<= #x40 #xA0))
172 (cond ((> u2 #x7F) (setf tmp 1))
173 (t (setf tmp 0)))
174 (+ +gbk5-offset+ (* 96 (- u1 #xA8)) (- u2 #x40 tmp)))
175 (t
176 (handle-error invalid-gbk-byte)))))
177
178 (when (>= index (length *gbk-unicode-mapping*))
179 (handle-error invalid-gbk-byte))
180 (,setter (char-code
181 (elt *gbk-unicode-mapping* index))
182 dest noctets)))
183 (incf noctets))
184 finally (return (the fixnum (- noctets d-start)))))))