Skip to content

Commit

Permalink
Add arff stream reader
Browse files Browse the repository at this point in the history
  • Loading branch information
neil-lindquist committed Jul 14, 2018
1 parent e0b3c6b commit bb97250
Show file tree
Hide file tree
Showing 4 changed files with 255 additions and 3 deletions.
2 changes: 1 addition & 1 deletion hjs/src/read-data.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ line is column name. However if CSV-HEADER-P is a list of strings then
CSV-HEADER-P specifies the column names. EXTERNAL-FORMAT defaults to shift-jis."
(when (and (not external-format-p)
(eql type :csv))
(setf external-format *csv-default-external-format*))
(setf external-format clml.utility.csv::*csv-default-external-format*))
(with-open-file (stream filename :external-format external-format)
#+allegro (setf (excl:eol-convention f) :anynl-dos)
(read-data-from-stream stream
Expand Down
1 change: 1 addition & 0 deletions utility/clml.utility.asd
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
)
:components (
(:file "package")
(:file "arff")
(:file "csv")
(:file "priority-que")
(:file "fetch")
Expand Down
248 changes: 248 additions & 0 deletions utility/src/arff.lisp
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@

(in-package :clml.utility.arff)

(defun spacep (char)
(member char '(#\space #\tab) :test 'char-equal))

(defun next-non-space (string &optional (start 0))
"Gets the next index of a non space character"
(position-if-not 'spacep string :start start))

(defun next-space (string &optional (start 0))
"Gets the next index of a space character"
(position-if 'spacep string :start start))


(defun read-string (string &optional (start 0))
"Reads a string from the string then returns the string
and the next index"
(let ((start (next-non-space string start)))
(unless start
(error "Cannot read a string from ~S" string))
(let* ((start-sentinal-p (or (eql #\" (char string start)) (eql #\' (char string start))))
(end-test (if start-sentinal-p
(let ((start-char (char string start)))
(incf start)
(lambda (char) (char= char start-char)))
(lambda (char) (member char '(#\space #\tab #\{ #\} #\,) :test #'char=))))
(end (position-if end-test string :start start)))
(unless end
(setf end (length string)))
(values (subseq string start end) (if start-sentinal-p (1+ end) end)))))

(defun read-datatype (string &optional (start 0))
(let* ((start (next-non-space string start))
(first-word (subseq string start (next-space string start))))
(cond
((null start)
(error "Cannot read a datatype from ~S" string))
((or (string-equal "numeric" first-word)
(string-equal "real" first-word))
'number)
((string-equal "integer" first-word)
'integer)
((string-equal "string" first-word)
'string)
((eql #\{ (char string start))
(iter (initially (setf loop-start start))
(for (type loop-start) = (multiple-value-list (read-string string (1+ loop-start))))
(collect type)
(setf loop-start (next-non-space string loop-start))
(unless loop-start (error "No closing } for nominal specifier"))
(while (char= (char string loop-start) #\,))))
(t (error "Cannot read a datatype from ~S" string)))))

(defun parse-header-line (line)
"Parses a line from the header into a cons pair containing a symbol
describing the line and a list containing any payload for the line.
Possible line descriptions are:
-- :relation - The name for the data. Has a single string as the payload
-- :attribute - The next attribute. Has a string name and a datatype as
the payload
-- :data - indicates the start of the data section
-- :empty-line - indicates an empty or comment line"

;Find first non space element to use as starting point
(let* ((start (next-non-space line))
(line-type (when start (subseq line (1+ start) (next-space line (1+ start))))))
(cond
((or (null start)
(eql #\% (char line start)))
'(:empty-line))
((not (eql #\@ (char line start)))
(error "Unable to process line ~S" line))
((string-equal "relation" line-type)
'(:relation))
((string-equal "attribute" line-type)
(multiple-value-bind (name next-start)
(read-string line (next-space line (1+ start)))
(list :attribute name (read-datatype line next-start))))
((string-equal "data" line-type)
'(:data))
(t (error "Unable to process line ~S" line)))))

(defun parse-arff-string (str)
(coerce
(let ((quote-or-escape-p (or (zerop (count #\" str :test #'char-equal))
(zerop (count #\' str :test #'char-equal))
(zerop (count #\\ str :test #'char-equal))))
(sep-regex (cl-ppcre:parse-string (string #\,))))
(cond ((not quote-or-escape-p)
(substitute nil "?" (cl-ppcre:split sep-regex str) :test 'string=))
(t
(macrolet ((push-f (fld flds) `(push (coerce (reverse ,fld) 'string) ,flds)))
(loop with state = :at-first ;; :at-first | :data-nq | :data-sq | :data-dq | :at-end | :line-end
with field with fields
for chr of-type character across str
until (eq state :line-end)
do (cond ((eq state :data-escape) (push chr field))
((eq state :at-first)
(setf field nil)
(cond ((char-equal chr #\\) (setf state :data-escape))
((char-equal chr #\") (setf state :data-dq))
((char-equal chr #\') (setf state :data-sq))
((char-equal chr #\,) (push "" fields))
((char-equal chr #\%)
(when fields ;else its a blank line
(push "" fields))
(setf state :line-end))
(t (setf state :data-nq) (push chr field))))
((eq state :data-nq)
(cond ((char-equal chr #\\) (setf state :data-escape))
((char-equal chr #\,)
(if (equal '(#\?) field)
(push nil fields)
(push-f field fields))
(setf state :at-first))
((char-equal chr #\%)
(if (equal '(#\?) field)
(push nil fields)
(push-f field fields))
(setf state :line-end))
(t (push chr field))))
((eq state :data-dq)
(cond ((char-equal chr #\\) (setf state :data-escape))
((char-equal chr #\") (setf state :at-end))
(t (push chr field))))
((eq state :data-sq)
(cond ((char-equal chr #\\) (setf state :data-escape))
((char-equal chr #\') (setf state :at-end))
(t (push chr field))))
((eq state :at-end)
(cond ((char-equal chr #\%) (setf state :line-end))
((char-equal chr #\,)
(push-f field fields)
(setf state :at-first))
(t (error "illegal value ( ~A ) after quotation" chr)))))
finally (return
(progn
(unless (eq state :line-end) (push-f field fields))
(reverse fields))))))))
'vector))

(defun read-arff-line (stream &key type-conv-fns map-fns (start 0) end)
"Read one line from stream and return a csv record.
A CSV record is a vector of elements.
type-conv-fns should be a list of functions.
If type-conv-fns is nil (the default case), then all will be treated
as string.
map-fns is a list of functions of one argument and output one result.
each function in it will be applied to the parsed element.
If map-fns is nil, then nothing will be applied.
start and end specifies how many elements per record will be included.
If start or end is negative, it counts from the end. -1 is the last element.
"
(declare (type (or (simple-array function *) null) type-conv-fns map-fns))
(let* ((rline (read-line stream nil nil)))
(when rline
(let* ((line (string-trim '(#\Space #\Tab #\Newline #\Return) rline))
(strs (parse-arff-string line))
(strs-size (length strs)))
(when (= (length strs) 0)
(return-from read-arff-line nil))
(when (< start 0)
(setf start (+ start strs-size)))
(when (and end (< end 0))
(setf end (+ end strs-size)))
(setf strs (subseq strs start end))
(when type-conv-fns
(unless (= (length strs) (length type-conv-fns))
(error "Number of type specifier (~a) does not match the number of elements (~a)."
(length type-conv-fns) (length strs))))
(when map-fns
(unless (= (length strs) (length map-fns))
(error "Number of mapping functions (~a) does not match the number of elements (~a)."
(length strs) (length map-fns))))
(let ((result strs))
;; strs is not needed so we simply overwrite it
(when type-conv-fns
(setf result
(map 'vector #'funcall type-conv-fns result)))
(when map-fns
(setf result
(map 'vector #'funcall map-fns result)))
result)))))

(defun read-arff-stream (stream &key map-fns (start 0) end)
"Read from stream in arff format until eof and return a csv table.
A csv table is a vector of csv records.
A csv record is a vector of elements.
map-fns is a list of functions of one argument and output one result.
each function in it will be applied to the parsed element.
If any function in the list is nil or t, it equals to #'identity.
If map-fns is nil, then nothing will be applied.
start and end specifies how many elements per record will be included.
If start or end is negative, it counts from the end. -1 is the last element.
"
(let* ((attributes
(loop for (line-type . line-data) = (parse-header-line (read-line stream))
until (eql line-type :data)
if (eql line-type :attribute)
collect line-data))
(type-conv-fns
(macrolet ((make-num-specifier (specifier)
`(lambda (s) (let ((s (clml.utility.csv::parse-number-no-error s s)))
(if (numberp s) (funcall ,specifier s) s)))))
(map 'vector
(lambda (attribute)
(let ((type (second attribute)))
(cond
((eql 'string type) #'identity)
((eql 'number type) (make-num-specifier #'identity))
((eql 'integer type) (make-num-specifier #'round))
((listp type)
(lambda (value)
(if (member value type :test #'string-equal)
value
(error "~S is not one of ~S" value type))))
(t (error "Unknown attribute type ~S" type)))))
attributes)))
(map-fns
(when map-fns
(map 'vector
(lambda (fn)
(cond ((or (eq fn t)
(eq fn nil))
#'identity)
((functionp fn)
fn)
((and (symbolp fn)
(not (keywordp fn)))
(symbol-function fn))
(t (error "~a is not a valid function specifier." fn))))
map-fns))))
(loop for rec = (read-arff-line stream :type-conv-fns type-conv-fns :map-fns map-fns
:start start :end end)
while rec
collect rec into result
finally (return
(values
(coerce result 'vector)
(map 'vector 'first attributes))))))
7 changes: 5 additions & 2 deletions utility/src/package.lisp
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
;-*- coding: utf-8 -*-

(defpackage :clml.utility.arff
(:use :common-lisp :iterate)
(:export #:read-arff-stream))

(defpackage :clml.utility.csv
(:use :common-lisp :iterate :parse-number)

Expand All @@ -17,7 +21,7 @@
(:import-from :iterate :iter :iterate)
(:shadowing-import-from :iterate :while)
(:import-from :alexandria #:define-constant)

(:export #:make-prique
#:prique-empty-p
#:prique-box-item
Expand All @@ -32,4 +36,3 @@
(:use :common-lisp)
(:export
#:fetch #:process-finance-header))

0 comments on commit bb97250

Please sign in to comment.