Skip to content

Commit bb97250

Browse files
Add arff stream reader
1 parent e0b3c6b commit bb97250

File tree

4 files changed

+255
-3
lines changed

4 files changed

+255
-3
lines changed

hjs/src/read-data.lisp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ line is column name. However if CSV-HEADER-P is a list of strings then
501501
CSV-HEADER-P specifies the column names. EXTERNAL-FORMAT defaults to shift-jis."
502502
(when (and (not external-format-p)
503503
(eql type :csv))
504-
(setf external-format *csv-default-external-format*))
504+
(setf external-format clml.utility.csv::*csv-default-external-format*))
505505
(with-open-file (stream filename :external-format external-format)
506506
#+allegro (setf (excl:eol-convention f) :anynl-dos)
507507
(read-data-from-stream stream

utility/clml.utility.asd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
)
3939
:components (
4040
(:file "package")
41+
(:file "arff")
4142
(:file "csv")
4243
(:file "priority-que")
4344
(:file "fetch")

utility/src/arff.lisp

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
2+
(in-package :clml.utility.arff)
3+
4+
(defun spacep (char)
5+
(member char '(#\space #\tab) :test 'char-equal))
6+
7+
(defun next-non-space (string &optional (start 0))
8+
"Gets the next index of a non space character"
9+
(position-if-not 'spacep string :start start))
10+
11+
(defun next-space (string &optional (start 0))
12+
"Gets the next index of a space character"
13+
(position-if 'spacep string :start start))
14+
15+
16+
(defun read-string (string &optional (start 0))
17+
"Reads a string from the string then returns the string
18+
and the next index"
19+
(let ((start (next-non-space string start)))
20+
(unless start
21+
(error "Cannot read a string from ~S" string))
22+
(let* ((start-sentinal-p (or (eql #\" (char string start)) (eql #\' (char string start))))
23+
(end-test (if start-sentinal-p
24+
(let ((start-char (char string start)))
25+
(incf start)
26+
(lambda (char) (char= char start-char)))
27+
(lambda (char) (member char '(#\space #\tab #\{ #\} #\,) :test #'char=))))
28+
(end (position-if end-test string :start start)))
29+
(unless end
30+
(setf end (length string)))
31+
(values (subseq string start end) (if start-sentinal-p (1+ end) end)))))
32+
33+
(defun read-datatype (string &optional (start 0))
34+
(let* ((start (next-non-space string start))
35+
(first-word (subseq string start (next-space string start))))
36+
(cond
37+
((null start)
38+
(error "Cannot read a datatype from ~S" string))
39+
((or (string-equal "numeric" first-word)
40+
(string-equal "real" first-word))
41+
'number)
42+
((string-equal "integer" first-word)
43+
'integer)
44+
((string-equal "string" first-word)
45+
'string)
46+
((eql #\{ (char string start))
47+
(iter (initially (setf loop-start start))
48+
(for (type loop-start) = (multiple-value-list (read-string string (1+ loop-start))))
49+
(collect type)
50+
(setf loop-start (next-non-space string loop-start))
51+
(unless loop-start (error "No closing } for nominal specifier"))
52+
(while (char= (char string loop-start) #\,))))
53+
(t (error "Cannot read a datatype from ~S" string)))))
54+
55+
(defun parse-header-line (line)
56+
"Parses a line from the header into a cons pair containing a symbol
57+
describing the line and a list containing any payload for the line.
58+
Possible line descriptions are:
59+
-- :relation - The name for the data. Has a single string as the payload
60+
-- :attribute - The next attribute. Has a string name and a datatype as
61+
the payload
62+
-- :data - indicates the start of the data section
63+
-- :empty-line - indicates an empty or comment line"
64+
65+
;Find first non space element to use as starting point
66+
(let* ((start (next-non-space line))
67+
(line-type (when start (subseq line (1+ start) (next-space line (1+ start))))))
68+
(cond
69+
((or (null start)
70+
(eql #\% (char line start)))
71+
'(:empty-line))
72+
((not (eql #\@ (char line start)))
73+
(error "Unable to process line ~S" line))
74+
((string-equal "relation" line-type)
75+
'(:relation))
76+
((string-equal "attribute" line-type)
77+
(multiple-value-bind (name next-start)
78+
(read-string line (next-space line (1+ start)))
79+
(list :attribute name (read-datatype line next-start))))
80+
((string-equal "data" line-type)
81+
'(:data))
82+
(t (error "Unable to process line ~S" line)))))
83+
84+
(defun parse-arff-string (str)
85+
(coerce
86+
(let ((quote-or-escape-p (or (zerop (count #\" str :test #'char-equal))
87+
(zerop (count #\' str :test #'char-equal))
88+
(zerop (count #\\ str :test #'char-equal))))
89+
(sep-regex (cl-ppcre:parse-string (string #\,))))
90+
(cond ((not quote-or-escape-p)
91+
(substitute nil "?" (cl-ppcre:split sep-regex str) :test 'string=))
92+
(t
93+
(macrolet ((push-f (fld flds) `(push (coerce (reverse ,fld) 'string) ,flds)))
94+
(loop with state = :at-first ;; :at-first | :data-nq | :data-sq | :data-dq | :at-end | :line-end
95+
with field with fields
96+
for chr of-type character across str
97+
until (eq state :line-end)
98+
do (cond ((eq state :data-escape) (push chr field))
99+
((eq state :at-first)
100+
(setf field nil)
101+
(cond ((char-equal chr #\\) (setf state :data-escape))
102+
((char-equal chr #\") (setf state :data-dq))
103+
((char-equal chr #\') (setf state :data-sq))
104+
((char-equal chr #\,) (push "" fields))
105+
((char-equal chr #\%)
106+
(when fields ;else its a blank line
107+
(push "" fields))
108+
(setf state :line-end))
109+
(t (setf state :data-nq) (push chr field))))
110+
((eq state :data-nq)
111+
(cond ((char-equal chr #\\) (setf state :data-escape))
112+
((char-equal chr #\,)
113+
(if (equal '(#\?) field)
114+
(push nil fields)
115+
(push-f field fields))
116+
(setf state :at-first))
117+
((char-equal chr #\%)
118+
(if (equal '(#\?) field)
119+
(push nil fields)
120+
(push-f field fields))
121+
(setf state :line-end))
122+
(t (push chr field))))
123+
((eq state :data-dq)
124+
(cond ((char-equal chr #\\) (setf state :data-escape))
125+
((char-equal chr #\") (setf state :at-end))
126+
(t (push chr field))))
127+
((eq state :data-sq)
128+
(cond ((char-equal chr #\\) (setf state :data-escape))
129+
((char-equal chr #\') (setf state :at-end))
130+
(t (push chr field))))
131+
((eq state :at-end)
132+
(cond ((char-equal chr #\%) (setf state :line-end))
133+
((char-equal chr #\,)
134+
(push-f field fields)
135+
(setf state :at-first))
136+
(t (error "illegal value ( ~A ) after quotation" chr)))))
137+
finally (return
138+
(progn
139+
(unless (eq state :line-end) (push-f field fields))
140+
(reverse fields))))))))
141+
'vector))
142+
143+
(defun read-arff-line (stream &key type-conv-fns map-fns (start 0) end)
144+
"Read one line from stream and return a csv record.
145+
146+
A CSV record is a vector of elements.
147+
148+
type-conv-fns should be a list of functions.
149+
If type-conv-fns is nil (the default case), then all will be treated
150+
as string.
151+
152+
map-fns is a list of functions of one argument and output one result.
153+
each function in it will be applied to the parsed element.
154+
If map-fns is nil, then nothing will be applied.
155+
156+
start and end specifies how many elements per record will be included.
157+
If start or end is negative, it counts from the end. -1 is the last element.
158+
"
159+
(declare (type (or (simple-array function *) null) type-conv-fns map-fns))
160+
(let* ((rline (read-line stream nil nil)))
161+
(when rline
162+
(let* ((line (string-trim '(#\Space #\Tab #\Newline #\Return) rline))
163+
(strs (parse-arff-string line))
164+
(strs-size (length strs)))
165+
(when (= (length strs) 0)
166+
(return-from read-arff-line nil))
167+
(when (< start 0)
168+
(setf start (+ start strs-size)))
169+
(when (and end (< end 0))
170+
(setf end (+ end strs-size)))
171+
(setf strs (subseq strs start end))
172+
(when type-conv-fns
173+
(unless (= (length strs) (length type-conv-fns))
174+
(error "Number of type specifier (~a) does not match the number of elements (~a)."
175+
(length type-conv-fns) (length strs))))
176+
(when map-fns
177+
(unless (= (length strs) (length map-fns))
178+
(error "Number of mapping functions (~a) does not match the number of elements (~a)."
179+
(length strs) (length map-fns))))
180+
(let ((result strs))
181+
;; strs is not needed so we simply overwrite it
182+
(when type-conv-fns
183+
(setf result
184+
(map 'vector #'funcall type-conv-fns result)))
185+
(when map-fns
186+
(setf result
187+
(map 'vector #'funcall map-fns result)))
188+
result)))))
189+
190+
(defun read-arff-stream (stream &key map-fns (start 0) end)
191+
"Read from stream in arff format until eof and return a csv table.
192+
193+
A csv table is a vector of csv records.
194+
A csv record is a vector of elements.
195+
196+
map-fns is a list of functions of one argument and output one result.
197+
each function in it will be applied to the parsed element.
198+
If any function in the list is nil or t, it equals to #'identity.
199+
If map-fns is nil, then nothing will be applied.
200+
201+
start and end specifies how many elements per record will be included.
202+
If start or end is negative, it counts from the end. -1 is the last element.
203+
"
204+
(let* ((attributes
205+
(loop for (line-type . line-data) = (parse-header-line (read-line stream))
206+
until (eql line-type :data)
207+
if (eql line-type :attribute)
208+
collect line-data))
209+
(type-conv-fns
210+
(macrolet ((make-num-specifier (specifier)
211+
`(lambda (s) (let ((s (clml.utility.csv::parse-number-no-error s s)))
212+
(if (numberp s) (funcall ,specifier s) s)))))
213+
(map 'vector
214+
(lambda (attribute)
215+
(let ((type (second attribute)))
216+
(cond
217+
((eql 'string type) #'identity)
218+
((eql 'number type) (make-num-specifier #'identity))
219+
((eql 'integer type) (make-num-specifier #'round))
220+
((listp type)
221+
(lambda (value)
222+
(if (member value type :test #'string-equal)
223+
value
224+
(error "~S is not one of ~S" value type))))
225+
(t (error "Unknown attribute type ~S" type)))))
226+
attributes)))
227+
(map-fns
228+
(when map-fns
229+
(map 'vector
230+
(lambda (fn)
231+
(cond ((or (eq fn t)
232+
(eq fn nil))
233+
#'identity)
234+
((functionp fn)
235+
fn)
236+
((and (symbolp fn)
237+
(not (keywordp fn)))
238+
(symbol-function fn))
239+
(t (error "~a is not a valid function specifier." fn))))
240+
map-fns))))
241+
(loop for rec = (read-arff-line stream :type-conv-fns type-conv-fns :map-fns map-fns
242+
:start start :end end)
243+
while rec
244+
collect rec into result
245+
finally (return
246+
(values
247+
(coerce result 'vector)
248+
(map 'vector 'first attributes))))))

utility/src/package.lisp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
;-*- coding: utf-8 -*-
22

3+
(defpackage :clml.utility.arff
4+
(:use :common-lisp :iterate)
5+
(:export #:read-arff-stream))
6+
37
(defpackage :clml.utility.csv
48
(:use :common-lisp :iterate :parse-number)
59

@@ -17,7 +21,7 @@
1721
(:import-from :iterate :iter :iterate)
1822
(:shadowing-import-from :iterate :while)
1923
(:import-from :alexandria #:define-constant)
20-
24+
2125
(:export #:make-prique
2226
#:prique-empty-p
2327
#:prique-box-item
@@ -32,4 +36,3 @@
3236
(:use :common-lisp)
3337
(:export
3438
#:fetch #:process-finance-header))
35-

0 commit comments

Comments
 (0)