Added keyword synonyms for the current codecs.
[lisp-utils.git] / charcode.lisp
CommitLineData
fd26d811
FT
;;;; CHAR-CODE -- Conversions between characters and byte
;;;; representations thereof
3
;;; Package definition.  The Gray stream vocabulary is taken from SB-GRAY
;;; on SBCL and from a package named GRAY on every other implementation.
(defpackage #:charcode
  (:use #:cl
        #+sbcl #:sb-gray
        #-sbcl #:gray)
  (:export
   ;; Codec lookup and whole-sequence conversion
   #:make-encoder
   #:make-decoder
   #:encode-string
   #:decode-string
   #:system-charset
   ;; Error reporting
   #:coding-error
   ;; Gray stream wrapper
   #:make-codec-character-stream
   ;; Codec names
   #:ascii
   #:latin-1
   #:latin1
   #:utf-8
   #:utf8))
fd26d811
FT
10(in-package :charcode)
11
;;; General stuff
13
;;; Root condition for every codec failure.  The three slots are filled
;;; from the :INPUT, :POSITION and :RESULT initargs; no readers are defined.
(define-condition coding-error (error)
  ((input
    :initarg :input)
   (position
    :initarg :position)
   (result
    :initarg :result))
  (:documentation
   "Signalled when a byte or character sequence cannot be converted."))

;;; A CODING-ERROR that is also a SIMPLE-ERROR, so it accepts
;;; :FORMAT-CONTROL and :FORMAT-ARGUMENTS for its message.
(define-condition simple-coding-error (coding-error simple-error)
  ()
  (:documentation
   "A CODING-ERROR whose message comes from a format control and arguments."))
20
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR.
INPUT, POSITION and RESULT are stored in the condition; FORMAT and
FORMAT-ARGS become its format control and format arguments."
  (error (make-condition 'simple-coding-error
                         :format-control format
                         :format-arguments format-args
                         :input input
                         :position position
                         :result result)))
25
(deftype decoder-fun ()
  "Type of a decoder: a function of a byte array and a character array,
with fixnum :START and :END keywords, that returns T or NIL."
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (start fixnum) (end fixnum))
    boolean))
(deftype encoder-fun ()
  "Type of an encoder: a function of a character array and a byte array,
with fixnum :START and :END keywords, that returns T or NIL."
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (start fixnum) (end fixnum))
    boolean))
34
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder factory for the codec NAME.
The MAKE-ENCODER forms become the body of a zero-argument function that
is stored under the MAKE-ENCODER property of the symbol NAME."
  (let ((factory `(lambda () ,@make-encoder)))
    `(setf (get (quote ,name) 'make-encoder)
           (function ,factory))))
37
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder factory for the codec NAME.
The MAKE-DECODER forms become the body of a zero-argument function that
is stored under the MAKE-DECODER property of the symbol NAME."
  (let ((factory `(lambda () ,@make-decoder)))
    `(setf (get (quote ,name) 'make-decoder)
           (function ,factory))))
40
(defmacro define-codec-synonyms (name &rest synonyms)
  "Make each symbol in SYNONYMS an alias for the codec NAME.
At load or execution time the MAKE-ENCODER and MAKE-DECODER properties
of NAME are copied onto every synonym."
  (flet ((alias-form (alias)
           `(setf (get ',alias 'make-encoder) (get ',name 'make-encoder)
                  (get ',alias 'make-decoder) (get ',name 'make-decoder))))
    `(eval-when (:load-toplevel :execute)
       ,@(loop for alias in synonyms
               collect (alias-form alias)))))
47
(defun make-encoder (name)
  "Return a fresh encoder function for the codec called NAME.
NAME is a symbol that had an encoder registered with DEFINE-ENCODER or
DEFINE-CODEC-SYNONYMS.  Signals an error naming the codec when nothing
is registered, instead of failing while trying to call NIL."
  (let ((factory (and (symbolp name) (get name 'make-encoder))))
    (unless factory
      (error "No encoder is defined for the character coding ~S." name))
    ;; VALUES drops any extra return values of the factory.
    (the encoder-fun (values (funcall factory)))))
50
(defun make-decoder (name)
  "Return a fresh decoder function for the codec called NAME.
NAME is a symbol that had a decoder registered with DEFINE-DECODER or
DEFINE-CODEC-SYNONYMS.  Signals an error naming the codec when nothing
is registered, instead of failing while trying to call NIL."
  (let ((factory (and (symbolp name) (get name 'make-decoder))))
    (unless factory
      (error "No decoder is defined for the character coding ~S." name))
    ;; VALUES drops any extra return values of the factory.
    (the decoder-fun (values (funcall factory)))))
53
6f9e13dc
FT
(defun system-charset ()
  "Return the codec name used when a caller does not choose one.
Always the symbol UTF-8 at present."
  ;; XXX: Replace me with something perhaps more sensible.
  (quote utf-8))
57
(defun encode-string (string &optional (coding (system-charset)))
  "Encode STRING with the codec CODING and return the bytes.
The result is an adjustable (UNSIGNED-BYTE 8) vector with a fill pointer.
Signals a CODING-ERROR when the encoder reports an incomplete encoding."
  (declare (type string string))
  (let* ((encoder (make-encoder coding))
         (nchars (length string))
         (bytes (make-array nchars
                            :element-type '(unsigned-byte 8)
                            :fill-pointer 0
                            :adjustable t)))
    (if (funcall encoder string bytes)
        bytes
        (coding-error string nchars bytes
                      "Encoding of string in ~A ended prematurely." coding))))
65
(defun decode-string (buffer &optional (coding (system-charset)))
  "Decode the bytes in BUFFER with the codec CODING and return the text.
The result is an adjustable character vector with a fill pointer.
Signals a CODING-ERROR when the decoder reports an incomplete sequence."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let* ((decoder (make-decoder coding))
         (nbytes (length buffer))
         (text (make-array nbytes
                           :element-type 'character
                           :fill-pointer 0
                           :adjustable t)))
    (if (funcall decoder buffer text)
        text
        (coding-error buffer nbytes text
                      "~A byte sequence ended prematurely." coding))))
73
;;; Gray stream implementation
75
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  (;; Decoder and encoder functions for the selected codec.
   (decoder
    :initarg :decoder)
   (encoder
    :initarg :encoder)
   ;; The underlying stream that bytes are read from and written to.
   (back
    :initarg :back)
   ;; Index into BUFFER of the next character to hand out.
   (read-pos
    :initform 0)
   ;; Decoded characters that have not been consumed yet.
   (buffer
    :initform (make-array 64
                          :element-type 'character
                          :fill-pointer 0
                          :adjustable t)))
  (:documentation
   "Bidirectional character stream that encodes and decodes through a codec
on top of another stream."))
82
(defun make-codec-character-stream (real-stream &optional (charset (system-charset)))
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM using the codec CHARSET.
A fresh decoder and a fresh encoder are created for the new stream."
  (declare (type stream real-stream))
  (let ((decoder (make-decoder charset))
        (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :back real-stream
                   :encoder encoder
                   :decoder decoder)))
86
fd26d811
FT
;;; Closing the codec stream closes the underlying stream first, passing
;;; ABORT through, and then runs the next CLOSE method.
(defmethod close ((stream codec-character-stream) &key abort)
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
91
;;; The codec stream is open exactly when its underlying stream is.
;;; The question must be put to BACK: asking STREAM itself would
;;; re-enter this method and never return.
(defmethod open-stream-p ((stream codec-character-stream))
  (with-slots (back) stream
    (open-stream-p back)))
95
(defun ccs-ensure-buffer (stream len)
  "Try to make at least LEN unread characters available in STREAM's buffer.
Bytes are read from the underlying stream and decoded until enough
characters are buffered.  Returns T on success, or NIL as soon as a read
from the underlying stream yields no bytes."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (let ((octets (make-array len :element-type '(unsigned-byte 8))))
      (flet ((missing ()
               ;; Characters still needed to reach LEN unread ones.
               (- len (- (length buffer) read-pos))))
        (loop while (plusp (missing))
              do (let ((nread (read-sequence octets back :end (missing))))
                   (when (zerop nread)
                     (return-from ccs-ensure-buffer nil))
                   (funcall decoder octets buffer :end nread))
              finally (return t))))))
106
(defun ccs-clear-buffer (stream)
  "Discard the consumed prefix of STREAM's character buffer.
The unread characters are moved to the front, the fill pointer is
shortened to match and READ-POS is reset to 0."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos buffer) stream
    (let ((unread (- (fill-pointer buffer) read-pos)))
      (replace buffer buffer :start2 read-pos)
      (setf (fill-pointer buffer) unread)
      (setf read-pos 0))))
fd26d811
FT
113
;;; Return the next decoded character, or :EOF when none can be obtained.
;;; Once 16 or more characters have been consumed, the buffer is compacted.
(defmethod stream-read-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((char (aref buffer read-pos)))
          (incf read-pos)
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          char))
      :eof))
121
;;; Push CHAR back so the next read returns it.  Always returns NIL.
;;;
;;; When there is no consumed slot in front of the unread data
;;; (READ-POS is 0), 16 slots of headroom are opened at the front of the
;;; buffer: the buffer is lengthened by 16 (growing the array if needed),
;;; the existing contents are shifted up by 16, and READ-POS is moved to
;;; the start of the shifted data.
(defmethod stream-unread-char ((stream codec-character-stream) char)
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      (let* ((headroom 16)
             (len (length buffer))
             (new-len (+ len headroom)))
        ;; The fill pointer must cover the shifted data in both cases and
        ;; may only be raised once the array is large enough to hold it.
        (if (< (array-dimension buffer 0) new-len)
            (setf buffer (adjust-array buffer new-len :fill-pointer new-len))
            (setf (fill-pointer buffer) new-len))
        ;; REPLACE on one and the same array acts as if the source were
        ;; copied first, so the overlapping shift is safe.
        (replace buffer buffer :start1 headroom :end2 len)
        (setf read-pos headroom)))
    (setf (aref buffer (decf read-pos)) char)
    nil))
132
(defun ccs-wont-hang-p (stream)
  "Return true when a read from STREAM is not expected to block.
That is the case when an unread character is already buffered (the
character itself is returned) or when LISTEN on the underlying stream
returns true."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (cond ((and (< read-pos (length buffer))
                (aref buffer read-pos)))
          (t
           (listen back)))))
138
;;; Read a character only when CCS-WONT-HANG-P says it is safe to do so;
;;; otherwise return NIL without touching the stream.
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (stream-read-char stream)))
143
;;; Return the next character without consuming it, or :EOF when no
;;; character can be buffered.
(defmethod stream-peek-char ((stream codec-character-stream))
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
149
;;; Return the next character when one can be had without blocking;
;;; return NIL when reading might block or the stream is at end of file.
(defmethod stream-listen ((stream codec-character-stream))
  (when (ccs-wont-hang-p stream)
    (let ((next (stream-peek-char stream)))
      (unless (eq next :eof)
        next))))
157
;;; Encode the single character CHAR and write the resulting bytes to the
;;; underlying stream.  The value of WRITE-SEQUENCE is returned.
(defmethod stream-write-char ((stream codec-character-stream) char)
  (with-slots (encoder back) stream
    (let ((chars (make-array 1
                             :element-type 'character
                             :initial-element char))
          (octets (make-array 16
                              :element-type '(unsigned-byte 8)
                              :fill-pointer 0
                              :adjustable t)))
      (funcall encoder chars octets)
      (write-sequence octets back))))
164
;;; FINISH-OUTPUT is forwarded to the underlying stream.
(defmethod stream-finish-output ((stream codec-character-stream))
  (with-slots (back) stream
    (finish-output back)))
167
;;; FORCE-OUTPUT is forwarded to the underlying stream.
(defmethod stream-force-output ((stream codec-character-stream))
  (with-slots (back) stream
    (force-output back)))
170
;;; Fill SEQ between START and END with decoded characters.
;;; Returns the index of the first element of SEQ that was not updated,
;;; which is what READ-SEQUENCE is specified to return; this is less than
;;; END when the underlying stream ran out of data.  A START or END of
;;; NIL is treated as 0 and the length of SEQ respectively.
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let* ((start (or start 0))
         (end (or end (length seq)))
         (wanted (- end start)))
    ;; A short read is not an error here, so the result is ignored.
    (ccs-ensure-buffer stream wanted)
    (with-slots (read-pos buffer) stream
      (let ((count (min wanted (- (length buffer) read-pos))))
        (replace seq buffer
                 :start1 start :end1 end
                 :start2 read-pos :end2 (length buffer))
        ;; Compact the buffer once 128 or more characters are consumed.
        (when (>= (incf read-pos count) 128)
          (ccs-clear-buffer stream))
        (+ start count)))))
178
;;; Encode the characters of SEQ between START and END and write the
;;; bytes to the underlying stream.  Returns SEQ, as WRITE-SEQUENCE does.
;;; START and END are handed to the encoder so that only the requested
;;; part of SEQ is written.  A START or END of NIL is treated as 0 and
;;; the length of SEQ respectively.
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  (let ((start (or start 0))
        (end (or end (length seq))))
    (with-slots (encoder back) stream
      (let ((outbuf (make-array (- end start)
                                :element-type '(unsigned-byte 8)
                                :adjustable t
                                :fill-pointer 0)))
        (funcall encoder seq outbuf :start start :end end)
        (write-sequence outbuf back)))
    seq))
184
;;; Implementation-specific functions
186
#+(or (and clisp unicode) sbcl)
(defun unicode->char (unicode)
  "Return the character for the code point UNICODE.
On the implementations selected by the feature expression this is
simply CODE-CHAR."
  (declare (type (unsigned-byte 24) unicode))
  (values (code-char unicode)))
191
#+(or (and clisp unicode) sbcl)
(defun char->unicode (char)
  "Return the code point of the character CHAR.
On the implementations selected by the feature expression this is
simply CHAR-CODE."
  (declare (type character char))
  (values (char-code char)))
196
4dd02a73
FT
;;; ASCII
198
(defun decode-ascii (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the ASCII bytes of BYTESEQ from START to END onto CHARSEQ.
Characters are appended with VECTOR-PUSH-EXTEND.  Returns T when every
byte has been handled.  A byte of 128 or above signals a CODING-ERROR;
the :REPLACE-CHAR restart appends a substitute character (U+FFFD by
default) and the :SKIP-CHAR restart drops the byte.  Decoding continues
with the next byte after either restart."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    (restart-case
        (progn
          (loop while (< start end)
                do (let ((octet (aref byteseq start)))
                     ;; Advance first, so a restart resumes after this byte.
                     (incf start)
                     (when (>= octet 128)
                       (coding-error byteseq start charseq
                                     "Invalid byte ~D in ASCII stream." octet))
                     (vector-push-extend (unicode->char octet) charseq)))
          (return-from decode-ascii t))
      (:replace-char (&optional (replacement (unicode->char #xfffd)))
        :report "Replace the invalid byte with a character."
        (vector-push-extend replacement charseq))
      (:skip-char ()
        :report "Ignore the invalid byte."
        nil))))
217
(defun encode-ascii (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START to END as ASCII onto BYTESEQ.
Bytes are appended with VECTOR-PUSH-EXTEND.  Returns T when every
character has been handled.  A code point of 128 or above signals a
CODING-ERROR; the :REPLACE-CHAR restart appends a substitute character
(#\\? by default) and the :SKIP-CHAR restart drops the character.
Encoding continues with the next character after either restart."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    (restart-case
        (progn
          (loop while (< start end)
                do (let ((cp (char->unicode (aref charseq start))))
                     ;; Advance first, so a restart resumes after this one.
                     (incf start)
                     (unless (< cp 128)
                       ;; The largest encodable code point is 127.
                       (coding-error charseq start byteseq
                                     "ASCII cannot encode code-points higher than 127."))
                     (vector-push-extend cp byteseq)))
          (return-from encode-ascii t))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
237
;;; Register the ASCII codec.  The decoder and encoder keep no state, so
;;; each factory just returns the global function.  The keyword :ASCII is
;;; made a synonym for the symbol ASCII.
(define-decoder (ascii)
  (function decode-ascii))

(define-encoder (ascii)
  (function encode-ascii))

(define-codec-synonyms ascii
  :ascii)
245
fd26d811
FT
;;; Latin-1
247
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the Latin-1 bytes of BYTESEQ from START to END onto CHARSEQ.
Each byte value is used directly as a code point and the character is
appended with VECTOR-PUSH-EXTEND.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index))
                               charseq))
  t)
256
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START to END as Latin-1 onto BYTESEQ.
Bytes are appended with VECTOR-PUSH-EXTEND.  Returns T when every
character has been handled.  A code point of 256 or above signals a
CODING-ERROR; the :REPLACE-CHAR restart appends a substitute character
(#\\? by default) and the :SKIP-CHAR restart drops the character.
Encoding continues with the next character after either restart."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    (restart-case
        (progn
          (loop while (< start end)
                do (let ((cp (char->unicode (aref charseq start))))
                     ;; Advance first, so a restart resumes after this one.
                     (incf start)
                     (unless (< cp 256)
                       ;; The largest encodable code point is 255.
                       (coding-error charseq start byteseq
                                     "ISO-8859-1 cannot encode code-points higher than 255."))
                     (vector-push-extend cp byteseq)))
          (return-from encode-latin-1 t))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
276
;;; Register the Latin-1 codec.  The decoder and encoder keep no state,
;;; so each factory just returns the global function.  The codec is also
;;; reachable as LATIN1 and ISO-8859-1 and under the keyword forms of all
;;; three names.
(define-decoder (latin-1)
  (function decode-latin-1))

(define-encoder (latin-1)
  (function encode-latin-1))

(define-codec-synonyms latin-1
  latin1 iso-8859-1
  :latin-1 :latin1 :iso-8859-1)
fd26d811
FT
284
;;; UTF-8
286
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ from START to END as UTF-8 onto BYTESEQ.
Bytes are appended with VECTOR-PUSH-EXTEND.  Always returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((cp (char->unicode (aref charseq index))))
             (if (< cp 128)
                 ;; ASCII range: the code point is its own encoding.
                 (vector-push-extend cp byteseq)
                 ;; Multi-byte: peel off 6 bits at a time, lowest first,
                 ;; pushing each continuation byte onto the front of
                 ;; OCTETS so the list ends up in output order.
                 (let ((trailing 0)
                       (octets '()))
                   (loop
                     (push (logior #x80 (ldb (byte 6 0) cp)) octets)
                     (setf cp (ash cp -6))
                     (incf trailing)
                     ;; Stop once the remaining bits fit in the lead byte
                     ;; that announces TRAILING continuation bytes.
                     (when (< cp (expt 2 (- 6 trailing)))
                       (let ((prefix (logand #xff
                                             (lognot (1- (expt 2 (- 7 trailing)))))))
                         (push (logior prefix cp) octets))
                       (return)))
                   (map nil
                        (lambda (octet)
                          (vector-push-extend octet byteseq))
                        octets)))))
  t)
310
;;; Register the UTF-8 encoder.  It keeps no state, so the factory just
;;; returns the global function.
(define-encoder (utf-8)
  (function encode-utf-8))
313
;;; UTF-8 decoder.  Every decoder produced by this factory closes over two
;;; variables that survive between calls, so a multi-byte sequence may be
;;; split across successive calls:
;;;   PENDING     -- continuation bytes still expected for the character
;;;                  currently being assembled (0 between characters)
;;;   ACCUMULATOR -- the code-point bits gathered so far
;;; The decoder returns true only when it stops between characters.
;;;
;;; Malformed input signals a SIMPLE-CODING-ERROR.  The :REPLACE-CHAR
;;; restart appends a substitute character (U+FFFD by default) and the
;;; :SKIP-CHAR restart appends nothing; both then skip any continuation
;;; bytes that follow, reset the decoder state and resume decoding.
(define-decoder (utf-8)
  (let ((accumulator 0)
        (pending 0))
    (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
             (declare (type (array (unsigned-byte 8)) byteseq)
                      (type (array character) charseq)
                      (type fixnum start end))
             (let ((pos start))
               (flet ((failure (format &rest args)
                        ;; POS has already moved past the offending byte.
                        (error 'simple-coding-error
                               :input byteseq :position pos :result charseq
                               :format-control format :format-arguments args))
                      (skip-continuation-bytes ()
                        ;; Step over bytes of the form 10xxxxxx.
                        (loop while (and (< pos end)
                                         (= (ldb (byte 2 6) (aref byteseq pos)) 2))
                              do (incf pos))))
                 (loop
                   (restart-case
                       (progn
                         (loop while (< pos end)
                               do (let ((byte (aref byteseq pos)))
                                    (incf pos)
                                    (cond
                                      ;; In the middle of a multi-byte sequence.
                                      ((/= pending 0)
                                       (unless (= (ldb (byte 2 6) byte) 2)
                                         (failure "Invalid UTF-8 continuation byte: ~D" byte))
                                       (setf accumulator (+ (* accumulator 64)
                                                            (ldb (byte 6 0) byte)))
                                       (when (zerop (decf pending))
                                         ;; A multi-byte form of an ASCII
                                         ;; character is an error the user
                                         ;; may choose to accept.
                                         (when (< accumulator 128)
                                           (with-simple-restart (:accept "Accept anyway.")
                                             (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char accumulator))))
                                         (vector-push-extend (unicode->char accumulator)
                                                             charseq)))
                                      ;; Plain ASCII byte.
                                      ((< byte 128)
                                       (vector-push-extend (unicode->char byte) charseq))
                                      ;; Lead byte: count the 1 bits from
                                      ;; bit 7 down to bit 1 that precede
                                      ;; the first 0 bit.
                                      (t
                                       (let ((ones (loop for bit from 7 downto 1
                                                         while (logbitp bit byte)
                                                         count t)))
                                         (cond ((< ones 2)
                                                (failure "UTF-8 sequence started with continuation byte: ~D" byte))
                                               ((= ones 7)
                                                (failure "Invalid UTF-8 sequence start byte: ~D" byte)))
                                         ;; ONES - 1 continuation bytes
                                         ;; follow; the lead byte supplies
                                         ;; its low 6 - PENDING bits.
                                         (setf pending (1- ones)
                                               accumulator (ldb (byte (- 6 pending) 0)
                                                                byte)))))))
                         (return-from decode (= pending 0)))
                     (:replace-char (&optional (replacement (unicode->char #xfffd)))
                       :report "Replace the invalid bytes with a character."
                       (vector-push-extend replacement charseq)
                       (skip-continuation-bytes)
                       (setf pending 0))
                     (:skip-char ()
                       :report "Ignore the invalid byte sequence."
                       (skip-continuation-bytes)
                       (setf pending 0))))))))
      #'decode)))
366
;;; The UTF-8 codec is also reachable as UTF8 and under the keyword forms
;;; of both names.
(define-codec-synonyms utf-8
  utf8
  :utf-8 :utf8)