Initial implementation of CHARCODE.
[lisp-utils.git] / charcode.lisp
CommitLineData
fd26d811
FT
1;;;; CHAR-CODE -- Conversions between characters and byte
2;;;; representations thereof
3
;;; Package definition.  Besides the entry points, the symbols naming the
;;; codings are exported: MAKE-ENCODER and MAKE-DECODER look a codec up on
;;; the property list of the symbol they are given, so callers need to be
;;; able to name those symbols.  ISO-8859-1 is registered as a synonym of
;;; LATIN-1 in this file and is therefore exported alongside LATIN1.
(defpackage :charcode
  (:use :cl #+sbcl :sb-gray #-sbcl :gray)
  (:export "MAKE-ENCODER" "MAKE-DECODER" "ENCODE-STRING" "DECODE-STRING"
           "CODING-ERROR"
           "LATIN-1" "LATIN1" "ISO-8859-1" "UTF-8" "UTF8"))
(in-package :charcode)
10
11;;; General stuff
12
;; Readers are provided because the standard leaves SLOT-VALUE on condition
;; objects unspecified; handlers should use these functions instead.
(define-condition coding-error (error)
  ((input :initarg :input :reader coding-error-input
          :documentation "The sequence that was being encoded or decoded.")
   (position :initarg :position :reader coding-error-position
             :documentation "Index into the input supplied by the signaller.")
   (result :initarg :result :reader coding-error-result
           :documentation "The output sequence as produced so far."))
  (:documentation "Base class of the errors signalled while encoding or decoding."))
17
(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation
   "A CODING-ERROR whose message is given by a format control and arguments."))
19
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR.
INPUT, POSITION and RESULT are stored in the condition; FORMAT and
FORMAT-ARGS become its format control and format arguments."
  (let ((condition (make-condition 'simple-coding-error
                                   :input input
                                   :position position
                                   :result result
                                   :format-control format
                                   :format-arguments format-args)))
    (error condition)))
24
;; Type of a decoder: it takes the byte array to read, the character array to
;; append to and optional START/END bounds, and returns a boolean.
(deftype decoder-fun ()
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (start fixnum) (end fixnum))
    boolean))
;; Type of an encoder: it takes the character array to read, the byte array
;; to append to and optional START/END bounds, and returns a boolean.
(deftype encoder-fun ()
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (start fixnum) (end fixnum))
    boolean))
33
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder constructor for the coding NAME.
The MAKE-ENCODER forms become the body of a function of no arguments which
is stored under the MAKE-ENCODER indicator on the property list of NAME."
  (let ((constructor `(lambda () ,@make-encoder)))
    `(setf (get ',name 'make-encoder) ,constructor)))
36
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder constructor for the coding NAME.
The MAKE-DECODER forms become the body of a function of no arguments which
is stored under the MAKE-DECODER indicator on the property list of NAME."
  (let ((constructor `(lambda () ,@make-decoder)))
    `(setf (get ',name 'make-decoder) ,constructor)))
39
(defmacro define-codec-synonyms (name &rest synonyms)
  "Make each symbol in SYNONYMS name the same codec as NAME.
The encoder and decoder constructors currently registered for NAME are
copied onto the property list of every synonym."
  `(eval-when (:load-toplevel :execute)
     ,@(loop for alias in synonyms
             collect `(setf (get ',alias 'make-encoder) (get ',name 'make-encoder)
                            (get ',alias 'make-decoder) (get ',name 'make-decoder)))))
46
(defun make-encoder (name)
  "Return a fresh encoder function for the coding named by the symbol NAME.
Signals an error if no encoder constructor is registered for NAME."
  (let ((constructor (get name 'make-encoder)))
    ;; Without this check an unknown coding would end in (FUNCALL NIL),
    ;; whose error does not mention the coding at all.
    (unless constructor
      (error "~S does not name a character coding with a registered encoder." name))
    (the encoder-fun (values (funcall constructor)))))
49
(defun make-decoder (name)
  "Return a fresh decoder function for the coding named by the symbol NAME.
Signals an error if no decoder constructor is registered for NAME."
  (let ((constructor (get name 'make-decoder)))
    ;; Without this check an unknown coding would end in (FUNCALL NIL),
    ;; whose error does not mention the coding at all.
    (unless constructor
      (error "~S does not name a character coding with a registered decoder." name))
    (the decoder-fun (values (funcall constructor)))))
52
(defun encode-string (string coding)
  "Encode STRING in CODING and return the bytes.
The result is a fresh adjustable (UNSIGNED-BYTE 8) vector with a fill
pointer.  A CODING-ERROR is signalled if the encoder returns false."
  (declare (type string string))
  (let* ((encoder (make-encoder coding))
         (nchars (length string))
         (octets (make-array nchars
                             :element-type '(unsigned-byte 8)
                             :adjustable t
                             :fill-pointer 0)))
    (if (funcall encoder string octets)
        octets
        (coding-error string nchars octets
                      "Encoding of string in ~A ended prematurely." coding))))
60
(defun decode-string (buffer coding)
  "Decode the bytes in BUFFER according to CODING and return the text.
The result is a fresh adjustable character vector with a fill pointer.  A
CODING-ERROR is signalled if the decoder returns false."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let* ((decoder (make-decoder coding))
         (nbytes (length buffer))
         (text (make-array nbytes
                           :element-type 'character
                           :adjustable t
                           :fill-pointer 0)))
    (if (funcall decoder buffer text)
        text
        (coding-error buffer nbytes text
                      "~A byte sequence ended prematurely." coding))))
68
69;;; Gray stream implementation
70
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  ((decoder :initarg :decoder
            :documentation "Decoder function applied to bytes read from BACK.")
   (encoder :initarg :encoder
            :documentation "Encoder function applied to characters before writing to BACK.")
   (back :initarg :back
         :documentation "The underlying stream that bytes are read from and written to.")
   (read-pos :initform 0
             :documentation "Index in BUFFER of the next character to hand out.")
   (buffer :initform (make-array 64
                                 :element-type 'character
                                 :adjustable t
                                 :fill-pointer 0)
           :documentation "Characters that have been decoded but not yet read."))
  (:documentation
   "Bidirectional character stream which encodes and decodes through a backing stream."))
77
(defmethod close ((stream codec-character-stream) &key abort)
  "Close the backing stream, passing ABORT along, then run the next method."
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
82
(defmethod open-stream-p ((stream codec-character-stream))
  "Return true if the backing stream of STREAM is open."
  ;; The question must be asked of the backing stream; asking STREAM itself
  ;; would re-enter this very method and never terminate.
  (with-slots (back) stream
    (open-stream-p back)))
86
(defun ccs-ensure-buffer (stream len)
  "Try to make at least LEN unread characters available in STREAM's buffer.
Bytes are read from the backing stream in chunks of up to LEN and fed to the
decoder until enough characters are buffered.  Returns T on success, or NIL
as soon as a read from the backing stream yields no bytes."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (do ((octets (make-array len :element-type '(unsigned-byte 8))))
        ((>= (- (length buffer) read-pos) len) t)
      (let ((nread (read-sequence octets back)))
        (when (zerop nread)
          (return nil))
        ;; Only the first NREAD bytes of the scratch array are valid.
        (funcall decoder octets buffer :end nread)))))
97
(defun ccs-clear-buffer (stream)
  "Discard the characters of STREAM's buffer that have already been read.
The unread characters are moved to the front of the buffer, the fill pointer
is shortened accordingly and the read position is reset to 0."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos buffer) stream
    (let ((unread (- (fill-pointer buffer) read-pos)))
      (replace buffer buffer :start2 read-pos)
      (setf (fill-pointer buffer) unread)
      (setf read-pos 0))))
104
(defmethod stream-read-char ((stream codec-character-stream))
  "Return the next character of STREAM, or :EOF if none can be obtained."
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((char (aref buffer read-pos)))
          (incf read-pos)
          ;; Compact the buffer once 16 or more characters have been consumed.
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          char))
      :eof))
112
(defmethod stream-unread-char ((stream codec-character-stream) char)
  "Put CHAR back in front of the unread characters of STREAM.  Returns NIL."
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      ;; There is no room in front of the unread data: open a gap of 16
      ;; characters by moving the buffer contents up.
      (let ((len (length buffer)))
        ;; Grow the storage first; the fill pointer may not exceed the
        ;; array dimension, so it can only be raised afterwards.
        (when (< (array-dimension buffer 0) (+ len 16))
          (adjust-array buffer (list (+ len 16))))
        (setf (fill-pointer buffer) (+ len 16))
        ;; REPLACE on one and the same array behaves as if the source were
        ;; copied elsewhere first, so the overlapping move is safe.
        (replace buffer buffer :start1 16 :end2 len)
        ;; The unread data now starts at index 16.
        (setf read-pos 16)))
    ;; Step back first, then store: the slot at the old READ-POS still holds
    ;; the next unread character and must not be overwritten.
    (decf read-pos)
    (setf (aref buffer read-pos) char)
    nil))
124
(defun ccs-wont-hang-p (stream)
  "Return true if STREAM has a buffered character or its backing stream has input.
When a character is buffered, that character is the value returned;
otherwise the value is that of LISTEN on the backing stream."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (cond ((and (< read-pos (length buffer))
                (aref buffer read-pos)))
          (t (listen back)))))
130
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  "Read a character only if CCS-WONT-HANG-P reports input; otherwise return NIL."
  (when (ccs-wont-hang-p stream)
    (stream-read-char stream)))
135
(defmethod stream-peek-char ((stream codec-character-stream))
  "Return the next character of STREAM without consuming it, or :EOF."
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
141
(defmethod stream-listen ((stream codec-character-stream))
  "Return the next character if one is available, otherwise NIL.
NIL is returned both when CCS-WONT-HANG-P reports no input and when the
peek hits end of file."
  (when (ccs-wont-hang-p stream)
    (let ((next (stream-peek-char stream)))
      (unless (eq next :eof)
        next))))
149
(defmethod stream-write-char ((stream codec-character-stream) char)
  "Encode CHAR and write the resulting bytes to the backing stream.
Returns CHAR."
  (with-slots (encoder back) stream
    (let ((seq (make-array 1 :element-type 'character :initial-element char))
          (outbuf (make-array 16
                              :element-type '(unsigned-byte 8)
                              :adjustable t
                              :fill-pointer 0)))
      (funcall encoder seq outbuf)
      (write-sequence outbuf back)))
  ;; The Gray streams protocol has STREAM-WRITE-CHAR return the character,
  ;; not the internal byte buffer.
  char)
156
(defmethod stream-finish-output ((stream codec-character-stream))
  "Call FINISH-OUTPUT on the backing stream."
  (with-slots (back) stream
    (finish-output back)))
159
(defmethod stream-force-output ((stream codec-character-stream))
  "Call FORCE-OUTPUT on the backing stream."
  (with-slots (back) stream
    (force-output back)))
162
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Fill SEQ between START and END with characters read from STREAM.
Returns the index of the first element of SEQ that was not updated, which
is less than END when the input ran out early."
  ;; Callers may pass NIL explicitly for either bound, in which case the
  ;; parameter defaults above do not apply.
  (let* ((start (or start 0))
         (end (or end (length seq)))
         (wanted (- end start)))
    ;; The result is deliberately ignored: on end of file whatever could be
    ;; decoded is still delivered.
    (ccs-ensure-buffer stream wanted)
    (with-slots (read-pos buffer) stream
      (let ((len (min wanted (- (length buffer) read-pos))))
        (replace seq buffer :start1 start :end1 end :start2 read-pos :end2 (length buffer))
        ;; Compact the buffer once 128 or more characters have been consumed.
        (when (>= (incf read-pos len) 128)
          (ccs-clear-buffer stream))
        (+ start len)))))
170
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Encode the characters of SEQ between START and END and write the bytes
to the backing stream.  Returns SEQ."
  ;; Callers may pass NIL explicitly for either bound, in which case the
  ;; parameter defaults above do not apply.
  (let ((start (or start 0))
        (end (or end (length seq))))
    (with-slots (encoder back) stream
      (let ((outbuf (make-array (- end start)
                                :element-type '(unsigned-byte 8)
                                :adjustable t
                                :fill-pointer 0)))
        ;; The bounds have to be handed to the encoder; otherwise the whole
        ;; of SEQ is written regardless of START and END.
        (funcall encoder seq outbuf :start start :end end)
        (write-sequence outbuf back))))
  seq)
176
177;;; Implementation-specific functions
178
#+(or (and clisp unicode) sbcl)
(defun unicode->char (unicode)
  "Return the character whose code is UNICODE, as given by CODE-CHAR."
  (declare (type (unsigned-byte 24) unicode))
  (code-char unicode))
183
#+(or (and clisp unicode) sbcl)
(defun char->unicode (char)
  "Return the code of CHAR, as given by CHAR-CODE."
  (declare (type character char))
  (char-code char))
188
189;;; Latin-1
190
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Append to CHARSEQ one character per byte of BYTESEQ between START and END.
Each byte value is used directly as the character code.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index)) charseq))
  t)
199
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Append to BYTESEQ one byte per character of CHARSEQ between START and END.
Returns T once all characters have been processed.  A character whose code
is 256 or above signals a CODING-ERROR, with two restarts available:
:REPLACE-CHAR emits the code of a replacement character (#\\? by default)
and :SKIP-CHAR drops the character; encoding then continues."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  ;; START doubles as the running index, so that after a restart the outer
  ;; loop resumes with the character following the offending one.
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-latin-1 t))
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            (unless (< cp 256)
              ;; 255 is the largest encodable code point.
              (coding-error charseq start byteseq
                            "ISO-8859-1 cannot encode code-points higher than 255."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
219
;; Latin-1 needs no per-instance state, so both constructors simply hand out
;; the global functions.
(define-decoder (latin-1)
  (function decode-latin-1))

(define-encoder (latin-1)
  (function encode-latin-1))

;; Alternative names for the same codec.
(define-codec-synonyms latin-1 latin1 iso-8859-1)
227
228;;; UTF-8
229
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Append the UTF-8 encoding of CHARSEQ between START and END to BYTESEQ.
Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((code (char->unicode (aref charseq index))))
             (if (< code 128)
                 ;; ASCII is encoded as itself.
                 (vector-push-extend code byteseq)
                 ;; Otherwise collect the bytes from last to first: six
                 ;; payload bits per continuation byte, until what is left
                 ;; fits into the lead byte for that sequence length.
                 (let ((octets '()))
                   (do ((ncont 1 (1+ ncont)))
                       (nil)
                     (push (logior #x80 (ldb (byte 6 0) code)) octets)
                     (setf code (ash code -6))
                     (when (< code (expt 2 (- 6 ncont)))
                       ;; The lead byte has its top NCONT+1 bits set.
                       (push (logior (logand #xff (lognot (1- (expt 2 (- 7 ncont)))))
                                     code)
                             octets)
                       (return)))
                   (dolist (octet octets)
                     (vector-push-extend octet byteseq))))))
  t)
253
;; The UTF-8 encoder keeps no state between calls, so the global function is
;; handed out as is.
(define-encoder (utf-8)
  (function encode-utf-8))
256
;; UTF-8 decoder constructor.  Every decoder made here owns two pieces of
;; state which persist between calls, so that a multibyte sequence may be
;; split across two calls:
;;   MBUF -- the code point bits accumulated so far,
;;   MLEN -- the number of continuation bytes still expected.
;; The decoder returns true if it stopped between characters and false if
;; the input ended in the middle of a multibyte sequence.  Malformed input
;; signals a SIMPLE-CODING-ERROR with the restarts :REPLACE-CHAR and
;; :SKIP-CHAR (and :ACCEPT for an overlong encoding of an ASCII character).
(define-decoder (utf-8)
  (let ((mbuf 0)
        (mlen 0))
    (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
             (declare (type (array (unsigned-byte 8)) byteseq)
                      (type (array character) charseq)
                      (type fixnum start end))
             (let ((i start))
               (flet ((failure (format &rest args)
                        ;; I is read when the error is signalled, so the
                        ;; condition records the current input position.
                        (error 'simple-coding-error
                               :input byteseq :position i :result charseq
                               :format-control format :format-arguments args))
                      (resynchronise ()
                        ;; Shared by both restarts: drop the continuation
                        ;; bytes that follow and forget the broken sequence.
                        (loop (unless (and (< i end)
                                           (= (ldb (byte 2 6) (aref byteseq i)) 2))
                                (return))
                              (incf i))
                        (setf mlen 0)))
                 (loop
                   (restart-case
                       (progn
                         (loop
                           (unless (< i end) (return))
                           (let ((byte (aref byteseq (prog1 i (incf i)))))
                             (if (= mlen 0)
                                 (if (< byte 128)
                                     (vector-push-extend (unicode->char byte) charseq)
                                     ;; Lead byte: the position N of the first
                                     ;; clear bit, counted from the top, gives
                                     ;; N-1 continuation bytes; the bits below
                                     ;; it start the code point.
                                     (setf mlen (block zero
                                                  (dotimes (n 7)
                                                    (when (= (ldb (byte 1 (- 7 n)) byte) 0)
                                                      (when (< n 2)
                                                        (failure "UTF-8 sequence started with continuation byte: ~D" byte))
                                                      (return-from zero (1- n))))
                                                  (failure "Invalid UTF-8 sequence start byte: ~D" byte))
                                           mbuf (ldb (byte (- 6 mlen) 0) byte)))
                                 (progn
                                   (when (not (= (ldb (byte 2 6) byte) 2))
                                     ;; This byte is not part of the broken
                                     ;; sequence.  Give it back so that after a
                                     ;; restart it is decoded in its own right
                                     ;; instead of being lost.
                                     (decf i)
                                     (failure "Invalid UTF-8 continuation byte: ~D" byte))
                                   (setf mbuf (+ (* mbuf 64) (ldb (byte 6 0) byte)))
                                   (when (= (decf mlen) 0)
                                     (when (< mbuf 128)
                                       (with-simple-restart (:accept "Accept anyway.")
                                         (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char mbuf))))
                                     ;; Long sequences can denote values that
                                     ;; are not character codes at all; report
                                     ;; them as coding errors rather than let
                                     ;; the character conversion fail.
                                     (unless (< mbuf char-code-limit)
                                       (failure "UTF-8 sequence denoted code point ~D, which is outside the supported character range." mbuf))
                                     (vector-push-extend (unicode->char mbuf) charseq))))))
                         (return-from decode (= mlen 0)))
                     (:replace-char (&optional (replacement (unicode->char #xfffd)))
                       :report "Replace the invalid bytes with a character."
                       (vector-push-extend replacement charseq)
                       (resynchronise))
                     (:skip-char ()
                       :report "Ignore the invalid byte sequence."
                       (resynchronise))))))))
      #'decode)))
309
;; UTF8 names the same codec as UTF-8.
(define-codec-synonyms utf-8 utf8)