|
| 1 | + |
| 2 | +(in-package :clml.utility.arff) |
| 3 | + |
| 4 | +(defun spacep (char) |
| 5 | + (member char '(#\space #\tab) :test 'char-equal)) |
| 6 | + |
| 7 | +(defun next-non-space (string &optional (start 0)) |
| 8 | + "Gets the next index of a non space character" |
| 9 | + (position-if-not 'spacep string :start start)) |
| 10 | + |
| 11 | +(defun next-space (string &optional (start 0)) |
| 12 | + "Gets the next index of a space character" |
| 13 | + (position-if 'spacep string :start start)) |
| 14 | + |
| 15 | + |
| 16 | +(defun read-string (string &optional (start 0)) |
| 17 | + "Reads a string from the string then returns the string |
| 18 | + and the next index" |
| 19 | + (let ((start (next-non-space string start))) |
| 20 | + (unless start |
| 21 | + (error "Cannot read a string from ~S" string)) |
| 22 | + (let* ((start-sentinal-p (or (eql #\" (char string start)) (eql #\' (char string start)))) |
| 23 | + (end-test (if start-sentinal-p |
| 24 | + (let ((start-char (char string start))) |
| 25 | + (incf start) |
| 26 | + (lambda (char) (char= char start-char))) |
| 27 | + (lambda (char) (member char '(#\space #\tab #\{ #\} #\,) :test #'char=)))) |
| 28 | + (end (position-if end-test string :start start))) |
| 29 | + (unless end |
| 30 | + (setf end (length string))) |
| 31 | + (values (subseq string start end) (if start-sentinal-p (1+ end) end))))) |
| 32 | + |
| 33 | +(defun read-datatype (string &optional (start 0)) |
| 34 | + (let* ((start (next-non-space string start)) |
| 35 | + (first-word (subseq string start (next-space string start)))) |
| 36 | + (cond |
| 37 | + ((null start) |
| 38 | + (error "Cannot read a datatype from ~S" string)) |
| 39 | + ((or (string-equal "numeric" first-word) |
| 40 | + (string-equal "real" first-word)) |
| 41 | + 'number) |
| 42 | + ((string-equal "integer" first-word) |
| 43 | + 'integer) |
| 44 | + ((string-equal "string" first-word) |
| 45 | + 'string) |
| 46 | + ((eql #\{ (char string start)) |
| 47 | + (iter (initially (setf loop-start start)) |
| 48 | + (for (type loop-start) = (multiple-value-list (read-string string (1+ loop-start)))) |
| 49 | + (collect type) |
| 50 | + (setf loop-start (next-non-space string loop-start)) |
| 51 | + (unless loop-start (error "No closing } for nominal specifier")) |
| 52 | + (while (char= (char string loop-start) #\,)))) |
| 53 | + (t (error "Cannot read a datatype from ~S" string))))) |
| 54 | + |
| 55 | +(defun parse-header-line (line) |
| 56 | + "Parses a line from the header into a cons pair containing a symbol |
| 57 | + describing the line and a list containing any payload for the line. |
| 58 | + Possible line descriptions are: |
| 59 | + -- :relation - The name for the data. Has a single string as the payload |
| 60 | + -- :attribute - The next attribute. Has a string name and a datatype as |
| 61 | + the payload |
| 62 | + -- :data - indicates the start of the data section |
| 63 | + -- :empty-line - indicates an empty or comment line" |
| 64 | + |
| 65 | + ;Find first non space element to use as starting point |
| 66 | + (let* ((start (next-non-space line)) |
| 67 | + (line-type (when start (subseq line (1+ start) (next-space line (1+ start)))))) |
| 68 | + (cond |
| 69 | + ((or (null start) |
| 70 | + (eql #\% (char line start))) |
| 71 | + '(:empty-line)) |
| 72 | + ((not (eql #\@ (char line start))) |
| 73 | + (error "Unable to process line ~S" line)) |
| 74 | + ((string-equal "relation" line-type) |
| 75 | + '(:relation)) |
| 76 | + ((string-equal "attribute" line-type) |
| 77 | + (multiple-value-bind (name next-start) |
| 78 | + (read-string line (next-space line (1+ start))) |
| 79 | + (list :attribute name (read-datatype line next-start)))) |
| 80 | + ((string-equal "data" line-type) |
| 81 | + '(:data)) |
| 82 | + (t (error "Unable to process line ~S" line))))) |
| 83 | + |
| 84 | +(defun parse-arff-string (str) |
| 85 | + (coerce |
| 86 | + (let ((quote-or-escape-p (or (zerop (count #\" str :test #'char-equal)) |
| 87 | + (zerop (count #\' str :test #'char-equal)) |
| 88 | + (zerop (count #\\ str :test #'char-equal)))) |
| 89 | + (sep-regex (cl-ppcre:parse-string (string #\,)))) |
| 90 | + (cond ((not quote-or-escape-p) |
| 91 | + (substitute nil "?" (cl-ppcre:split sep-regex str) :test 'string=)) |
| 92 | + (t |
| 93 | + (macrolet ((push-f (fld flds) `(push (coerce (reverse ,fld) 'string) ,flds))) |
| 94 | + (loop with state = :at-first ;; :at-first | :data-nq | :data-sq | :data-dq | :at-end | :line-end |
| 95 | + with field with fields |
| 96 | + for chr of-type character across str |
| 97 | + until (eq state :line-end) |
| 98 | + do (cond ((eq state :data-escape) (push chr field)) |
| 99 | + ((eq state :at-first) |
| 100 | + (setf field nil) |
| 101 | + (cond ((char-equal chr #\\) (setf state :data-escape)) |
| 102 | + ((char-equal chr #\") (setf state :data-dq)) |
| 103 | + ((char-equal chr #\') (setf state :data-sq)) |
| 104 | + ((char-equal chr #\,) (push "" fields)) |
| 105 | + ((char-equal chr #\%) |
| 106 | + (when fields ;else its a blank line |
| 107 | + (push "" fields)) |
| 108 | + (setf state :line-end)) |
| 109 | + (t (setf state :data-nq) (push chr field)))) |
| 110 | + ((eq state :data-nq) |
| 111 | + (cond ((char-equal chr #\\) (setf state :data-escape)) |
| 112 | + ((char-equal chr #\,) |
| 113 | + (if (equal '(#\?) field) |
| 114 | + (push nil fields) |
| 115 | + (push-f field fields)) |
| 116 | + (setf state :at-first)) |
| 117 | + ((char-equal chr #\%) |
| 118 | + (if (equal '(#\?) field) |
| 119 | + (push nil fields) |
| 120 | + (push-f field fields)) |
| 121 | + (setf state :line-end)) |
| 122 | + (t (push chr field)))) |
| 123 | + ((eq state :data-dq) |
| 124 | + (cond ((char-equal chr #\\) (setf state :data-escape)) |
| 125 | + ((char-equal chr #\") (setf state :at-end)) |
| 126 | + (t (push chr field)))) |
| 127 | + ((eq state :data-sq) |
| 128 | + (cond ((char-equal chr #\\) (setf state :data-escape)) |
| 129 | + ((char-equal chr #\') (setf state :at-end)) |
| 130 | + (t (push chr field)))) |
| 131 | + ((eq state :at-end) |
| 132 | + (cond ((char-equal chr #\%) (setf state :line-end)) |
| 133 | + ((char-equal chr #\,) |
| 134 | + (push-f field fields) |
| 135 | + (setf state :at-first)) |
| 136 | + (t (error "illegal value ( ~A ) after quotation" chr))))) |
| 137 | + finally (return |
| 138 | + (progn |
| 139 | + (unless (eq state :line-end) (push-f field fields)) |
| 140 | + (reverse fields)))))))) |
| 141 | + 'vector)) |
| 142 | + |
| 143 | +(defun read-arff-line (stream &key type-conv-fns map-fns (start 0) end) |
| 144 | + "Read one line from stream and return a csv record. |
| 145 | +
|
| 146 | +A CSV record is a vector of elements. |
| 147 | +
|
| 148 | +type-conv-fns should be a list of functions. |
| 149 | +If type-conv-fns is nil (the default case), then all will be treated |
| 150 | +as string. |
| 151 | +
|
| 152 | +map-fns is a list of functions of one argument and output one result. |
| 153 | +each function in it will be applied to the parsed element. |
| 154 | +If map-fns is nil, then nothing will be applied. |
| 155 | +
|
| 156 | +start and end specifies how many elements per record will be included. |
| 157 | +If start or end is negative, it counts from the end. -1 is the last element. |
| 158 | +" |
| 159 | + (declare (type (or (simple-array function *) null) type-conv-fns map-fns)) |
| 160 | + (let* ((rline (read-line stream nil nil))) |
| 161 | + (when rline |
| 162 | + (let* ((line (string-trim '(#\Space #\Tab #\Newline #\Return) rline)) |
| 163 | + (strs (parse-arff-string line)) |
| 164 | + (strs-size (length strs))) |
| 165 | + (when (= (length strs) 0) |
| 166 | + (return-from read-arff-line nil)) |
| 167 | + (when (< start 0) |
| 168 | + (setf start (+ start strs-size))) |
| 169 | + (when (and end (< end 0)) |
| 170 | + (setf end (+ end strs-size))) |
| 171 | + (setf strs (subseq strs start end)) |
| 172 | + (when type-conv-fns |
| 173 | + (unless (= (length strs) (length type-conv-fns)) |
| 174 | + (error "Number of type specifier (~a) does not match the number of elements (~a)." |
| 175 | + (length type-conv-fns) (length strs)))) |
| 176 | + (when map-fns |
| 177 | + (unless (= (length strs) (length map-fns)) |
| 178 | + (error "Number of mapping functions (~a) does not match the number of elements (~a)." |
| 179 | + (length strs) (length map-fns)))) |
| 180 | + (let ((result strs)) |
| 181 | + ;; strs is not needed so we simply overwrite it |
| 182 | + (when type-conv-fns |
| 183 | + (setf result |
| 184 | + (map 'vector #'funcall type-conv-fns result))) |
| 185 | + (when map-fns |
| 186 | + (setf result |
| 187 | + (map 'vector #'funcall map-fns result))) |
| 188 | + result))))) |
| 189 | + |
| 190 | +(defun read-arff-stream (stream &key map-fns (start 0) end) |
| 191 | + "Read from stream in arff format until eof and return a csv table. |
| 192 | +
|
| 193 | +A csv table is a vector of csv records. |
| 194 | +A csv record is a vector of elements. |
| 195 | +
|
| 196 | +map-fns is a list of functions of one argument and output one result. |
| 197 | +each function in it will be applied to the parsed element. |
| 198 | +If any function in the list is nil or t, it equals to #'identity. |
| 199 | +If map-fns is nil, then nothing will be applied. |
| 200 | +
|
| 201 | +start and end specifies how many elements per record will be included. |
| 202 | +If start or end is negative, it counts from the end. -1 is the last element. |
| 203 | +" |
| 204 | + (let* ((attributes |
| 205 | + (loop for (line-type . line-data) = (parse-header-line (read-line stream)) |
| 206 | + until (eql line-type :data) |
| 207 | + if (eql line-type :attribute) |
| 208 | + collect line-data)) |
| 209 | + (type-conv-fns |
| 210 | + (macrolet ((make-num-specifier (specifier) |
| 211 | + `(lambda (s) (let ((s (clml.utility.csv::parse-number-no-error s s))) |
| 212 | + (if (numberp s) (funcall ,specifier s) s))))) |
| 213 | + (map 'vector |
| 214 | + (lambda (attribute) |
| 215 | + (let ((type (second attribute))) |
| 216 | + (cond |
| 217 | + ((eql 'string type) #'identity) |
| 218 | + ((eql 'number type) (make-num-specifier #'identity)) |
| 219 | + ((eql 'integer type) (make-num-specifier #'round)) |
| 220 | + ((listp type) |
| 221 | + (lambda (value) |
| 222 | + (if (member value type :test #'string-equal) |
| 223 | + value |
| 224 | + (error "~S is not one of ~S" value type)))) |
| 225 | + (t (error "Unknown attribute type ~S" type))))) |
| 226 | + attributes))) |
| 227 | + (map-fns |
| 228 | + (when map-fns |
| 229 | + (map 'vector |
| 230 | + (lambda (fn) |
| 231 | + (cond ((or (eq fn t) |
| 232 | + (eq fn nil)) |
| 233 | + #'identity) |
| 234 | + ((functionp fn) |
| 235 | + fn) |
| 236 | + ((and (symbolp fn) |
| 237 | + (not (keywordp fn))) |
| 238 | + (symbol-function fn)) |
| 239 | + (t (error "~a is not a valid function specifier." fn)))) |
| 240 | + map-fns)))) |
| 241 | + (loop for rec = (read-arff-line stream :type-conv-fns type-conv-fns :map-fns map-fns |
| 242 | + :start start :end end) |
| 243 | + while rec |
| 244 | + collect rec into result |
| 245 | + finally (return |
| 246 | + (values |
| 247 | + (coerce result 'vector) |
| 248 | + (map 'vector 'first attributes)))))) |
0 commit comments