Skip to content

Commit

Permalink
UTF-8 tests
Browse files Browse the repository at this point in the history
  • Loading branch information
vii committed Mar 11, 2008
1 parent 50c8b4c commit a32384f
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/io/sendbuf.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
(cond ((sendbuf-p x)
(my merge x))
(x
(let ((buf (force-byte-vector x)))
(let ((buf (force-simple-byte-vector x)))
(unless (zerop (length buf))
(incf (my num-bufs))
(incf (my len) (the sendbuf-small-integer (length buf)))
Expand Down
5 changes: 3 additions & 2 deletions src/lib/utf8.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

(defun utf8-encode-really (string)
(declare (optimize speed))
(declare (type simple-string string))
(let ((dest-len
(loop for c across string summing (utf8-char-length (char-code c)))) )
(let ((vec (make-byte-vector dest-len)))
Expand Down Expand Up @@ -47,9 +48,9 @@
(let ((vec (make-byte-vector (length str))))
(loop for i fixnum from 0 for s across str do
(let ((c (char-code s)))
(when (> c 127)
(when (> #x80 c)
(return-from encode (utf8-encode-really str)))
(setf (aref vec i) (char-code s))))
(setf (aref vec i) c)))
vec)))

#+tpd2-big-characters-in-strings
Expand Down
26 changes: 18 additions & 8 deletions src/lib/utils.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,32 @@
(def-if-unbound defun-consistent utf8-encode (string)
(map 'byte-vector 'char-code string))

(defun-consistent byte-vector-to-string (vec)
(utf8-decode vec))
(defun-consistent byte-vector-to-simple-byte-vector (val)
(declare (optimize speed (safety 0)))
(declare (type (and byte-vector (not simple-byte-vector)) val))
(let ((ret (make-byte-vector (length val))))
(replace ret val)
ret))

(declaim (ftype (function ((and byte-vector (not simple-byte-vector))) simple-byte-vector) byte-vector-to-simple-byte-vector-consistent-internal))

(defun-consistent force-string (val)
(declare (optimize speed))
(declare (optimize speed (safety 0)))
(let ((str
(the string
(typecase val
(null "")
(symbol (symbol-name val))
(byte-vector (byte-vector-to-string val))
(string val)
(simple-byte-vector (utf8-decode val))
(byte-vector (utf8-decode (byte-vector-to-simple-byte-vector val)))
(t (let ((*print-pretty* nil)) (princ-to-string val)))))))
(etypecase str
(simple-string str)
(string (replace (make-string (length str)) (the (and string (not simple-string)) str))))))
(string
(locally
(declare (type (and string (not simple-string)) str))
(replace (make-string (length str)) str))))))

(declaim (ftype (function (t) simple-string) force-string-consistent-internal))

Expand All @@ -46,12 +55,13 @@
(etypecase val
(simple-byte-vector val)
(byte-vector
(let ((ret (make-byte-vector (length val))))
(replace ret (the (and byte-vector (not simple-byte-vector)) val))
ret)))))
(byte-vector-to-simple-byte-vector val)))))

(declaim (ftype (function (t) simple-byte-vector) force-simple-byte-vector-consistent-internal))

(defun-consistent byte-vector-to-string (vec)
(utf8-decode (force-simple-byte-vector vec)))

(defun byte-vector-cat (&rest args)
(declare (optimize speed))
(let ((vecs (mapcar (lambda(x)(force-byte-vector x)) args)))
Expand Down
1 change: 1 addition & 0 deletions src/packages.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#:byte-vector-to-string
#:make-byte-vector
#:force-byte-vector
#:force-simple-byte-vector
#:utf8-encode
#:utf8-decode
#:with-pointer-to-vector-data
Expand Down
51 changes: 51 additions & 0 deletions t/utf8.lisp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
(in-package #:tpd2.test)

(def-suite utf8 :in :tpd2)
(in-suite utf8)

#+(and sbcl tpd2-big-characters-in-strings)
(test string-to-octets
(loop for code in '(0 #x7f #x80 #x81 #x400 #x7ff #x800 #x801 #x1000 #xffff #x10000 #x10001 #x10002 #x10000) do
(let ((string (string (code-char code))))
(let ((u8e (utf8-encode string))
(sto (sb-ext:string-to-octets string :external-format :utf-8)))
(is (equalp u8e sto))

(let ((u8d (utf8-decode sto))
(ots (sb-ext:octets-to-string u8e :external-format :utf-8)))
(is (equalp u8d string))
(is (equalp ots string)))))))

(defun make-bad-utf8-sequence (code length)
(let ((vec (make-byte-vector length)) (i 0))
(flet ((out (val)
(setf (aref vec i) val)
(incf i)))
(ecase length
(1 (out code))
(2
(out (logior #xc0 (ash code -6)))
(out (logior #x80 (logand code #x3f))))
(3
(out (logior #xe0 (ash code -12)))
(out (logior #x80 (logand (ash code -6) #x3f)))
(out (logior #x80 (logand code #x3f))))
(4
(out (logior #xf0 (ash code -18)))
(out (logior #x80 (logand (ash code -12) #x3f)))
(out (logior #x80 (logand (ash code -6) #x3f)))
(out (logior #x80 (logand code #x3f))))))
vec))

#+tpd2-big-characters-in-strings
(test invalid-sequences
(let ((bad-seqs
'( (2 0) (2 #x40) (2 #x7f)
(3 #x400) (3 #x7ff)
(4 #x800)))
(invalid-decode (string (code-char #xfffd))))
(loop for (min-len code) in bad-seqs do
(loop for len from min-len upto 4 do
(let ((dec (utf8-decode (make-bad-utf8-sequence code len))))
(is (string= dec invalid-decode)))))
(is (string= (utf8-decode (force-byte-vector '(#x80))) invalid-decode))))
1 change: 1 addition & 0 deletions teepeedee2.asd
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
:components (
(:file "suite")
(:file "io" :depends-on ("suite"))
(:file "utf8" :depends-on ("suite"))
(:file "regex" :depends-on ("suite")))))
:depends-on (
:trivial-garbage
Expand Down

0 comments on commit a32384f

Please sign in to comment.