Signal an error when a named codec could not be found.
[lisp-utils.git] / charcode.lisp
CommitLineData
fd26d811
FT
1;;;; CHAR-CODE -- Conversions between characters and byte
2;;;; representations thereof
3
(defpackage :charcode
  (:use :cl #+sbcl :sb-gray #-sbcl :gray)
  ;; Codec construction and whole-string conversion.
  (:export "MAKE-ENCODER" "MAKE-DECODER"
           "ENCODE-STRING" "DECODE-STRING"
           "SYSTEM-CHARSET")
  ;; Conditions.
  (:export "NO-CODEC-ERROR" "CODING-ERROR")
  ;; Gray stream wrapper.
  (:export "MAKE-CODEC-CHARACTER-STREAM")
  ;; Codec names.
  (:export "ASCII" "LATIN-1" "LATIN1" "UTF-8" "UTF8"))

(in-package :charcode)
11
12;;; General stuff
13
c94c6f05
FT
;;; Signalled by MAKE-ENCODER and MAKE-DECODER when the requested name
;;; has no codec constructor on its property list.
(define-condition no-codec-error (error)
  ((codec-name :initarg :codec-name))
  (:report (lambda (condition stream)
             (format stream "Could find no codec named ~A."
                     (slot-value condition 'codec-name)))))
19
fd26d811
FT
;;; Base condition for encoding/decoding failures.  The slots record the
;;; sequence being converted, a position within it, and the output
;;; sequence at the time of the failure.
(define-condition coding-error (error)
  ((input    :initarg :input)
   (position :initarg :position)
   (result   :initarg :result)))
24
;;; A CODING-ERROR whose report text comes from a format control and
;;; arguments, as with any SIMPLE-ERROR.
(define-condition simple-coding-error (coding-error simple-error)
  ())
26
(defun coding-error (input position result format &rest format-args)
  "Signal a SIMPLE-CODING-ERROR describing a failure at POSITION in INPUT.
RESULT is the output sequence; FORMAT and FORMAT-ARGS make up the report
message."
  (error (make-condition 'simple-coding-error
                         :input input
                         :position position
                         :result result
                         :format-control format
                         :format-arguments format-args)))
31
;;; Function types of the codec closures.  A decoder takes a byte array and
;;; a character array; an encoder takes them in the opposite order.  Both
;;; accept :START and :END and return a boolean.
;;;
;;; The &KEY entries of a FUNCTION type specifier name the keyword that is
;;; actually passed in a call, so they must be :START/:END rather than the
;;; symbols START/END of the current package.
(deftype decoder-fun ()
  '(function ((array (unsigned-byte 8))
              (array character)
              &key (:start fixnum) (:end fixnum))
    (member t nil)))

(deftype encoder-fun ()
  '(function ((array character)
              (array (unsigned-byte 8))
              &key (:start fixnum) (:end fixnum))
    (member t nil)))
40
(defmacro define-encoder ((name) &body make-encoder)
  "Register an encoder constructor for the codec NAME.
The MAKE-ENCODER forms become the body of a zero-argument function stored
under the MAKE-ENCODER property of the symbol NAME."
  `(setf (get ',name 'make-encoder)
         (lambda () ,@make-encoder)))
43
(defmacro define-decoder ((name) &body make-decoder)
  "Register a decoder constructor for the codec NAME.
The MAKE-DECODER forms become the body of a zero-argument function stored
under the MAKE-DECODER property of the symbol NAME."
  `(setf (get ',name 'make-decoder)
         (lambda () ,@make-decoder)))
46
(defmacro define-codec-synonyms (name &rest synonyms)
  "Make each symbol in SYNONYMS an alias for the codec NAME.
The expansion copies the MAKE-ENCODER and MAKE-DECODER properties of NAME
onto every synonym; the copying happens at load or execution time."
  (flet ((alias-form (synonym)
           `(setf (get ',synonym 'make-encoder) (get ',name 'make-encoder)
                  (get ',synonym 'make-decoder) (get ',name 'make-decoder))))
    `(eval-when (:load-toplevel :execute)
       ,@(loop for synonym in synonyms
               collect (alias-form synonym)))))
53
(defun make-encoder (name)
  "Return a fresh encoder function for the codec NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-ENCODER property."
  (let ((constructor (get name 'make-encoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    (the encoder-fun (values (funcall constructor)))))
fd26d811
FT
57
(defun make-decoder (name)
  "Return a fresh decoder function for the codec NAME.
Signals NO-CODEC-ERROR when NAME has no MAKE-DECODER property."
  (let ((constructor (get name 'make-decoder)))
    (unless constructor
      (error 'no-codec-error :codec-name name))
    (the decoder-fun (values (funcall constructor)))))
fd26d811 61
6f9e13dc
FT
(defun system-charset ()
  "Return the codec name used when callers do not specify one."
  ;; XXX: Replace me with something perhaps more sensible.
  (quote utf-8))
65
(defun encode-string (string &optional (coding (system-charset)))
  "Encode STRING with the codec CODING and return the bytes.
The result is an adjustable (UNSIGNED-BYTE 8) vector with a fill pointer.
Signals a coding error if the encoder reports an incomplete result."
  (declare (type string string))
  (let* ((encoder (make-encoder coding))
         (octets (make-array (length string)
                             :element-type '(unsigned-byte 8)
                             :adjustable t
                             :fill-pointer 0)))
    (if (funcall encoder string octets)
        octets
        (coding-error string (length string) octets
                      "Encoding of string in ~A ended prematurely." coding))))
73
(defun decode-string (buffer &optional (coding (system-charset)))
  "Decode the byte array BUFFER with the codec CODING and return the text.
The result is an adjustable character vector with a fill pointer.
Signals a coding error if the decoder reports an incomplete sequence."
  (declare (type (array (unsigned-byte 8)) buffer))
  (let* ((decoder (make-decoder coding))
         (text (make-array (length buffer)
                           :element-type 'character
                           :adjustable t
                           :fill-pointer 0)))
    (if (funcall decoder buffer text)
        text
        (coding-error buffer (length buffer) text
                      "~A byte sequence ended prematurely." coding))))
81
82;;; Gray stream implementation
83
;;; Bidirectional character stream that wraps another stream (BACK) and
;;; converts through a decoder on input and an encoder on output.
(defclass codec-character-stream (fundamental-character-input-stream
                                  fundamental-character-output-stream)
  (;; Decoder function applied to bytes read from BACK.
   (decoder :initarg :decoder)
   ;; Encoder function applied to characters before writing to BACK.
   (encoder :initarg :encoder)
   ;; The wrapped stream.
   (back :initarg :back)
   ;; Index into BUFFER of the next character to hand out.
   (read-pos :initform 0)
   ;; Decoded characters that have not been consumed yet.
   (buffer :initform (make-array 64
                                 :element-type 'character
                                 :adjustable t
                                 :fill-pointer 0))))
90
(defun make-codec-character-stream (real-stream &optional (charset (system-charset)))
  "Wrap REAL-STREAM in a CODEC-CHARACTER-STREAM using the codec CHARSET."
  (declare (type stream real-stream))
  (let ((decoder (make-decoder charset))
        (encoder (make-encoder charset)))
    (make-instance 'codec-character-stream
                   :decoder decoder
                   :encoder encoder
                   :back real-stream)))
94
fd26d811
FT
(defmethod close ((stream codec-character-stream) &key abort)
  "Close the wrapped stream, passing ABORT along, then run the next method."
  (close (slot-value stream 'back) :abort abort)
  (call-next-method))
99
(defmethod open-stream-p ((stream codec-character-stream))
  "Return true if the wrapped stream is open."
  ;; Ask the wrapped stream; asking STREAM itself would recurse forever.
  (with-slots (back) stream
    (open-stream-p back)))
103
(defun ccs-ensure-buffer (stream len)
  "Try to make at least LEN unread characters available in STREAM's buffer.
Reads bytes from the wrapped stream and decodes them into the buffer until
enough characters are present.  Returns T on success, or NIL as soon as a
read from the wrapped stream yields no bytes."
  (declare (type codec-character-stream stream)
           (type integer len))
  (with-slots (decoder back buffer read-pos) stream
    (let ((octets (make-array len :element-type '(unsigned-byte 8))))
      (flet ((available ()
               (- (length buffer) read-pos)))
        (loop
          (when (>= (available) len)
            (return t))
          ;; Ask for only as many bytes as characters are still missing.
          (let ((count (read-sequence octets back :end (- len (available)))))
            (when (zerop count)
              (return nil))
            (funcall decoder octets buffer :end count)))))))
114
(defun ccs-clear-buffer (stream)
  "Drop the already-consumed prefix of STREAM's character buffer.
Moves the unread characters to the front, shrinks the fill pointer to match
and resets the read position to 0."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos buffer) stream
    (let ((remaining (- (fill-pointer buffer) read-pos)))
      (replace buffer buffer :start2 read-pos)
      (setf (fill-pointer buffer) remaining)
      (setf read-pos 0))))
fd26d811
FT
121
(defmethod stream-read-char ((stream codec-character-stream))
  "Return the next decoded character, or :EOF when none can be obtained."
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (let ((ch (aref buffer read-pos)))
          (incf read-pos)
          ;; Compact the buffer once 16 or more characters are consumed.
          (when (>= read-pos 16)
            (ccs-clear-buffer stream))
          ch))
      :eof))
129
(defmethod stream-unread-char ((stream codec-character-stream) char)
  "Push CHAR back so that it is the next character read.  Returns NIL."
  (with-slots (read-pos buffer) stream
    (when (= read-pos 0)
      ;; No room in front of the unread data: open a 16-character gap by
      ;; shifting the buffered characters up.
      (let* ((len (length buffer))
             (new-len (+ len 16)))
        ;; Grow the storage first; the fill pointer may not exceed it.
        (when (< (array-dimension buffer 0) new-len)
          (adjust-array buffer (list new-len)))
        (setf (fill-pointer buffer) new-len)
        ;; REPLACE on overlapping regions of one sequence acts as if the
        ;; source were copied aside first, so this shift is safe.
        (replace buffer buffer :start1 16 :end2 len)
        ;; The unread data now starts at index 16.
        (setf read-pos 16)))
    (setf (aref buffer (decf read-pos)) char)
    nil))
140
(defun ccs-wont-hang-p (stream)
  "Return true if a read on STREAM can proceed without waiting.
True when the buffer holds an unread character (that character is
returned); otherwise the result of LISTEN on the wrapped stream."
  (declare (type codec-character-stream stream))
  (with-slots (read-pos back buffer) stream
    (if (< read-pos (length buffer))
        (aref buffer read-pos)
        (listen back))))
146
(defmethod stream-read-char-no-hang ((stream codec-character-stream))
  "Read a character if CCS-WONT-HANG-P says one is ready; otherwise NIL."
  (when (ccs-wont-hang-p stream)
    (stream-read-char stream)))
151
(defmethod stream-peek-char ((stream codec-character-stream))
  "Return the next character without consuming it, or :EOF."
  (if (ccs-ensure-buffer stream 1)
      (with-slots (read-pos buffer) stream
        (aref buffer read-pos))
      :eof))
157
(defmethod stream-listen ((stream codec-character-stream))
  "Return the next character if one is ready, otherwise NIL.
NIL is also returned when peeking yields :EOF."
  (and (ccs-wont-hang-p stream)
       (let ((next (stream-peek-char stream)))
         (unless (eq next :eof)
           next))))
165
(defmethod stream-write-char ((stream codec-character-stream) char)
  "Encode CHAR and write the resulting bytes to the wrapped stream.
Returns CHAR."
  (with-slots (encoder back) stream
    (let ((seq (make-array '(1) :element-type 'character :initial-element char))
          (outbuf (make-array '(16) :element-type '(unsigned-byte 8)
                                    :adjustable t :fill-pointer 0)))
      (funcall encoder seq outbuf)
      (write-sequence outbuf back)))
  ;; Return the character rather than the temporary byte buffer.
  char)
172
(defmethod stream-finish-output ((stream codec-character-stream))
  "Call FINISH-OUTPUT on the wrapped stream."
  (with-slots (back) stream
    (finish-output back)))
175
(defmethod stream-force-output ((stream codec-character-stream))
  "Call FORCE-OUTPUT on the wrapped stream."
  (with-slots (back) stream
    (force-output back)))
178
(defmethod stream-read-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Fill SEQ between START and END with decoded characters from STREAM.
Returns the index of the first element of SEQ that was not updated."
  ;; NOTE(review): callers appear to be allowed to pass NIL for END to mean
  ;; the end of SEQ, so it is normalised here -- confirm against the Gray
  ;; streams implementation in use.
  (let* ((end (or end (length seq)))
         (wanted (- end start)))
    (ccs-ensure-buffer stream wanted)
    (with-slots (read-pos buffer) stream
      ;; Fewer than WANTED characters may be buffered if input ran out.
      (let ((len (min wanted (- (length buffer) read-pos))))
        (replace seq buffer :start1 start :end1 end
                            :start2 read-pos :end2 (length buffer))
        ;; Compact the buffer once 128 or more characters are consumed.
        (when (>= (incf read-pos len) 128)
          (ccs-clear-buffer stream))
        (+ start len)))))
186
(defmethod stream-write-sequence ((stream codec-character-stream) seq &optional (start 0) (end (length seq)))
  "Encode the characters of SEQ between START and END and write the bytes
to the wrapped stream.  Returns SEQ."
  (with-slots (encoder back) stream
    ;; NOTE(review): END is normalised because callers appear to be allowed
    ;; to pass NIL for it -- confirm against the Gray streams implementation.
    (let* ((end (or end (length seq)))
           (outbuf (make-array (list (- end start))
                               :element-type '(unsigned-byte 8)
                               :adjustable t :fill-pointer 0)))
      ;; Pass the bounds on; otherwise the whole of SEQ would be written.
      (funcall encoder seq outbuf :start start :end end)
      (write-sequence outbuf back)
      seq)))
192
193;;; Implementation-specific functions
194
#+(or (and clisp unicode) sbcl)
(defun unicode->char (unicode)
  "Return the result of CODE-CHAR on the integer UNICODE."
  (declare (type (unsigned-byte 24) unicode))
  (code-char unicode))
199
#+(or (and clisp unicode) sbcl)
(defun char->unicode (char)
  "Return the result of CHAR-CODE on the character CHAR."
  (declare (type character char))
  (char-code char))
204
4dd02a73
FT
205;;; ASCII
206
(defun decode-ascii (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ between START and END as ASCII.
Characters are appended to CHARSEQ.  A byte of 128 or more signals a coding
error with :REPLACE-CHAR and :SKIP-CHAR restarts; after either restart
decoding resumes with the following byte.  Returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (let ((pos start))
    (declare (type fixnum pos))
    (loop
      (restart-case
          (loop
            (when (>= pos end)
              (return-from decode-ascii t))
            (let ((byte (aref byteseq pos)))
              ;; Advance first so the reported position and the resume
              ;; point are both just past this byte.
              (incf pos)
              (if (< byte 128)
                  (vector-push-extend (unicode->char byte) charseq)
                  (coding-error byteseq pos charseq
                                "Invalid byte ~D in ASCII stream." byte))))
        (:replace-char (&optional (replacement (unicode->char #xfffd)))
          :report "Replace the invalid byte with a character."
          (vector-push-extend replacement charseq))
        (:skip-char ()
          :report "Ignore the invalid byte."
          nil)))))
225
(defun encode-ascii (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as ASCII.
Bytes are appended to BYTESEQ.  A code point of 128 or more signals a
coding error with :REPLACE-CHAR and :SKIP-CHAR restarts; after either
restart encoding resumes with the following character.  Returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-ascii t))
          ;; START is advanced before the check so that it already points
          ;; past the offending character when a restart resumes the loop.
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            (unless (< cp 128)
              ;; 127 is the highest encodable code point; 128 is rejected.
              (coding-error charseq start byteseq
                            "ASCII cannot encode code-points higher than 127."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
245
;; Register the ASCII codec.  The codec functions keep no state between
;; calls, so each constructor simply returns the function itself.
(define-decoder (ascii)
  (function decode-ascii))

(define-encoder (ascii)
  (function encode-ascii))

;; Also reachable under the keyword name.
(define-codec-synonyms ascii
  :ascii)
253
fd26d811
FT
254;;; Latin-1
255
(defun decode-latin-1 (byteseq charseq &key (start 0) (end (length byteseq)))
  "Decode the bytes of BYTESEQ between START and END as Latin-1.
Each byte is converted with UNICODE->CHAR and appended to CHARSEQ.
Returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (vector-push-extend (unicode->char (aref byteseq index)) charseq))
  t)
264
(defun encode-latin-1 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as Latin-1.
Bytes are appended to BYTESEQ.  A code point of 256 or more signals a
coding error with :REPLACE-CHAR and :SKIP-CHAR restarts; after either
restart encoding resumes with the following character.  Returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop
    (restart-case
        (loop
          (unless (< start end) (return-from encode-latin-1 t))
          ;; START is advanced before the check so that it already points
          ;; past the offending character when a restart resumes the loop.
          (let ((cp (char->unicode (aref charseq (prog1 start (incf start))))))
            (unless (< cp 256)
              ;; 255 is the highest encodable code point; 256 is rejected.
              (coding-error charseq start byteseq
                            "ISO-8859-1 cannot encode code-points higher than 255."))
            (vector-push-extend cp byteseq)))
      (:replace-char (&optional (replacement #\?))
        :report "Replace this character with another."
        (vector-push-extend (char->unicode replacement) byteseq))
      (:skip-char ()
        :report "Ignore this character."
        nil))))
284
;; Register the Latin-1 codec.  The codec functions keep no state between
;; calls, so each constructor simply returns the function itself.
(define-decoder (latin-1)
  (function decode-latin-1))

(define-encoder (latin-1)
  (function encode-latin-1))

;; Alternative spellings, as plain symbols and as keywords.
(define-codec-synonyms latin-1
  latin1 iso-8859-1
  :latin-1 :latin1 :iso-8859-1)
fd26d811
FT
292
293;;; UTF-8
294
(defun encode-utf-8 (charseq byteseq &key (start 0) (end (length charseq)))
  "Encode the characters of CHARSEQ between START and END as UTF-8.
Bytes are appended to BYTESEQ.  Returns T."
  (declare (type (array (unsigned-byte 8)) byteseq)
           (type (array character) charseq)
           (type fixnum start end))
  (loop for index from start below end
        do (let ((cp (char->unicode (aref charseq index))))
             (if (< cp 128)
                 ;; Code points below 128 are emitted as a single byte.
                 (vector-push-extend cp byteseq)
                 (let ((octets '())
                       (count 0))
                   ;; Peel off six bits at a time, lowest first, as
                   ;; continuation bytes.  Pushing onto the list leaves
                   ;; them in output order.
                   (loop
                     (push (logior #x80 (ldb (byte 6 0) cp)) octets)
                     (setf cp (ash cp -6))
                     (incf count)
                     ;; Stop once the remaining bits fit in a lead byte
                     ;; that announces COUNT continuation bytes.
                     (when (< cp (expt 2 (- 6 count)))
                       (push (logior (logand #xff
                                             (lognot (1- (expt 2 (- 7 count)))))
                                     cp)
                             octets)
                       (return)))
                   (dolist (octet octets)
                     (vector-push-extend octet byteseq))))))
  t)
318
;; ENCODE-UTF-8 keeps no state between calls, so the constructor simply
;; returns the function itself.
(define-encoder (utf-8)
  (function encode-utf-8))
321
;; UTF-8 decoder.  Each constructed decoder closes over its own state, so a
;; multi-byte sequence may be split across calls:
;;   PENDING -- number of continuation bytes still expected
;;   ACCUM   -- code point bits collected so far
;; The decoder returns true when the input ended on a sequence boundary.
(define-decoder (utf-8)
  (let ((accum 0)
        (pending 0))
    (flet ((decode (byteseq charseq &key (start 0) (end (length byteseq)))
             (declare (type (array (unsigned-byte 8)) byteseq)
                      (type (array character) charseq)
                      (type fixnum start end))
             (let ((pos start))
               (labels ((failure (format &rest args)
                          ;; POS is already past the offending byte here.
                          (error 'simple-coding-error
                                 :input byteseq :position pos :result charseq
                                 :format-control format :format-arguments args))
                        (continuation-p (byte)
                          ;; Continuation bytes have the top bits 10.
                          (= (ldb (byte 2 6) byte) 2))
                        (skip-continuations ()
                          (loop
                            (unless (and (< pos end)
                                         (continuation-p (aref byteseq pos)))
                              (return))
                            (incf pos)))
                        (continuation-count (byte)
                          ;; Continuation bytes announced by a lead byte
                          ;; of 128 or more.
                          (cond ((< byte #xc0)
                                 (failure "UTF-8 sequence started with continuation byte: ~D" byte))
                                ((< byte #xe0) 1)
                                ((< byte #xf0) 2)
                                ((< byte #xf8) 3)
                                ((< byte #xfc) 4)
                                ((< byte #xfe) 5)
                                (t
                                 (failure "Invalid UTF-8 sequence start byte: ~D" byte))))
                        (consume (byte)
                          (cond ((/= pending 0)
                                 (unless (continuation-p byte)
                                   (failure "Invalid UTF-8 continuation byte: ~D" byte))
                                 (setf accum (+ (* accum 64) (ldb (byte 6 0) byte)))
                                 (when (= (decf pending) 0)
                                   (when (< accum 128)
                                     (with-simple-restart (:accept "Accept anyway.")
                                       (failure "UTF-8 multibyte sequence denoted an ASCII character ~S (either an encoding error or an attempt at breaking security)." (unicode->char accum))))
                                   (vector-push-extend (unicode->char accum) charseq)))
                                ((< byte 128)
                                 (vector-push-extend (unicode->char byte) charseq))
                                (t
                                 ;; SETF assigns in order, so ACCUM sees the
                                 ;; new value of PENDING.
                                 (setf pending (continuation-count byte)
                                       accum (ldb (byte (- 6 pending) 0) byte))))))
                 (loop
                   (restart-case
                       (progn
                         (loop
                           (unless (< pos end)
                             (return))
                           (let ((byte (aref byteseq pos)))
                             (incf pos)
                             (consume byte)))
                         (return-from decode (= pending 0)))
                     (:replace-char (&optional (replacement (unicode->char #xfffd)))
                       :report "Replace the invalid bytes with a character."
                       (vector-push-extend replacement charseq)
                       (skip-continuations)
                       (setf pending 0))
                     (:skip-char ()
                       :report "Ignore the invalid byte sequence."
                       (skip-continuations)
                       (setf pending 0))))))))
      #'decode)))
374
;; Alternative spellings of the UTF-8 codec name, as a plain symbol and as
;; keywords.
(define-codec-synonyms utf-8
  utf8
  :utf-8 :utf8)