-
Notifications
You must be signed in to change notification settings - Fork 5
/
confusion-matrix.lisp
361 lines (330 loc) · 11.2 KB
/
confusion-matrix.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
;;;; Confusion matrices
;;;; class definition, initiatlization, formatting, auxiliary functions, etc
;;;;
;;;; For two sets of analyses of a set of sentences, a confusion
;;;; matrix compares them with respect to a label for each token. Each
;;;; token of a sentence is classified by a pair of labels (L1, L2),
;;;; meaning that the first analyses for the token labels it as L1,
;;;; while the second analyses labels it as L2.
;;; Usage overview:
;;
;; For creating a new confusion matrix from two lists of sentence, use
;; MAKE-CONFUSION-MATRIX.
;; For the list of labels, use CONFUSION-MATRIX-LABELS.
;; For accessing the cells, use CONFUSION-MATRIX-CELL-COUNT for the
;; values and CONFUSION-MATRIX-CELL-TOKENS for the instances (in ids,
;; not the objects).
;; Notice that, by default, not necessarily the matrix is square, as
;; not every pair LABEL1 LABEL2 is instantiated. For normalizing it,
;; instantiating every possible LABEL1 LABEL2 pair with a cell with 0
;; entries, use CONFUSION-MATRIX-NORMALIZE.
;;; class definition
(in-package :conllu.evaluate)
(defclass confusion-matrix ()
((corpus-id :initarg :corpus-id
:accessor confusion-matrix-corpus-id
:documentation "Identifier of the corpus or experiment.")
(key-fn :initarg :key-fn
:initform #'token-upostag
:accessor confusion-matrix-key-fn
:documentation "Function used to label each token.")
(test-fn :initarg :test-fn
:initform #'equal
:accessor confusion-matrix-test-fn
:documentation "Function which compares two labels.
Typically a form of equality.")
(sort-fn :initarg :sort-fn
:initform #'(lambda (x y)
(string<=
(format nil "~a" x)
(format nil "~a" y)))
:accessor confusion-matrix-sort-fn
:documentation "Function which sorts labels. By default,
converts labels to string and uses lexicographical order")
(rows :accessor confusion-matrix-rows
:documentation "Parameter which contains the contents of the
confusion matrix."
;; Hash table which maps to rows, which are themselves
;; hash tables that maps to an array #(COUNT LIST), where
;; LIST is a list of (sentence-id . token-id)
)))
(defmethod initialize-instance :after
((obj confusion-matrix) &key &allow-other-keys)
(setf (confusion-matrix-rows obj)
(make-hash-table :test (confusion-matrix-test-fn obj))))
(defmethod print-object ((obj confusion-matrix) out)
(print-unreadable-object (obj out :type t)
(format out "~%~{~a~%~}"
(mapcar
#'(lambda (label-pair)
`(,(first label-pair)
,(second label-pair)
,(confusion-matrix-cell-count
(first label-pair)
(second label-pair)
obj)))
(confusion-matrix-cells-labels obj)))))
;;; Utility functions
(defun confusion-matrix-rows-labels (cm)
"Returns the list of labels occuring in the rows of the confusion
matix CM."
(sort
(copy-list
(alexandria:hash-table-keys
(confusion-matrix-rows cm)))
(confusion-matrix-sort-fn cm)))
(defun confusion-matrix-columns-labels (cm)
(sort
(remove-duplicates
(alexandria:mappend
#'alexandria:hash-table-keys
(alexandria:hash-table-values (confusion-matrix-rows cm)))
:test (confusion-matrix-test-fn cm))
(confusion-matrix-sort-fn cm)))
(defun confusion-matrix-labels (cm)
"Returns the list of all labels in the confusion matrix CM."
;; output: list of labels
(sort
(remove-duplicates
(append
(confusion-matrix-rows-labels cm)
(confusion-matrix-columns-labels cm))
:test (confusion-matrix-test-fn cm))
(confusion-matrix-sort-fn cm)))
(defun confusion-matrix-cells-labels (cm)
"Returns a list of '(LABEL1 LABEL2) for each cell in the confusion
matrix CM."
;; output: list of pairs of labels
(apply #'append
(mapcar
#'(lambda (row)
(mapcar
#'(lambda (column)
`(,row ,column))
(sort (alexandria:hash-table-keys (gethash
row
(confusion-matrix-rows cm)))
(confusion-matrix-sort-fn cm))))
(sort (alexandria:hash-table-keys (confusion-matrix-rows cm))
(confusion-matrix-sort-fn cm)))))
(defun confusion-matrix-cell-count (label1 label2 cm &key default-if-undefined)
"Returns the number of tokens that are contained in the cell defined
by LABEL1 LABEL2 in the confusion matrix CM.
If DEFAULT-IF-UNDEFINED, returns 0. Otherwise, raises an error in
case there is no such cell."
;; output: int
(let ((entry-array
(gethash label2
(gethash label1 (confusion-matrix-rows cm)))))
(cond
(entry-array
(aref
entry-array
0))
(default-if-undefined
0)
(t
(error "There is no cell (~a ~a)."
label1
label2)))))
(defun confusion-matrix-cell-tokens (label1 label2 cm &key default-if-undefined)
"Returns the list of (SENT-ID . TOKEN-ID) of tokens in the cell
LABEL1 LABEL2.
If DEFAULT-IF-UNDEFINED, returns the empty list. Otherwise, raises
an error in case there is no such cell."
;; output: list of (sent-id . token-id)
(let ((entry-array
(gethash label2
(gethash label1 (confusion-matrix-rows cm)))))
(cond
(entry-array
(aref
entry-array
1))
(default-if-undefined
'())
(t
(error "There is no cell (~a ~a)."
label1
label2)))))
;; TODO
;; (defun confusion-matrix-sentences-ids (cm)
;; ;; output: list of strings
;; ...)
;; (defun confusion-matrix-exact-match-sentences (cm)
;; ;; output: list of strings (sent-id)
;; ...)
;;; initialization
(defun make-confusion-matrix (list-sent1 list-sent2
&key corpus-id (key-fn #'token-upostag) (test-fn #'equal)
(sort-fn #'(lambda (x y)
(string<=
(format nil "~a" x)
(format nil "~a" y)))))
"Creates a new confusion matrix from the lists of sentences
LIST-SENT1 and LIST-SENT2."
(assert (equal
(length list-sent1)
(length list-sent2))
()
"LIST-SENT1 and LIST-SENT2 should have the same number of sentences!")
(let ((cm (make-instance 'confusion-matrix
:test-fn test-fn
:key-fn key-fn
:corpus-id corpus-id
:sort-fn sort-fn)))
(confusion-matrix-update list-sent1 list-sent2 cm)))
(defun confusion-matrix-update (list-sent1 list-sent2 cm)
"Updates an existing confusion matrix by a list of sentences
LIST-SENT1 and LIST-SENT2."
(assert (equal
(length list-sent1)
(length list-sent2))
()
"LIST-SENT1 and LIST-SENT2 should have the same number of sentences!")
(mapc
#'(lambda (sent1 sent2)
(confusion-matrix-update-sentences sent1 sent2 cm))
list-sent1
list-sent2)
cm)
;;; low-level updating
(defun confusion-matrix-update-sentences (sent1 sent2 cm)
"Updates an existing confusion matrix by a pair of matching
sentences SENT1 and SENT2. That is, SENT1 and SENT2 should be
alternative analyses of the same natural language sentence."
(assert (equal
(sentence-size sent1)
(sentence-size sent2))
()
"SENTENCES~%~a~%and~%~a~% do not have the same number of tokens!"
sent1
sent2)
(assert (equal
(sentence-id sent1)
(sentence-id sent2))
()
"SENTENCES~%~a~%and~%~a~% do not have the same ID!"
sent1
sent2)
(mapc
#'(lambda (tk1 tk2)
(confusion-matrix-update-tokens
tk1 tk2 cm))
(sentence-tokens sent1)
(sentence-tokens sent2)))
(defun confusion-matrix-update-tokens (token1 token2 cm)
(assert (equal
(token-id token1)
(token-id token2))
()
"Different tokens are being compared! Tokens ~a and ~a do not have the same ID. ~%Perhaps different sentences are being compared."
token1 token2)
(assert (equal
(token-form token1)
(token-form token2))
()
"Different tokens are being compared! Tokens ~a and ~a do not have the same FORM. ~%Perhaps different sentences are being compared."
token1 token2)
(insert-entry-confusion-matrix
(funcall (confusion-matrix-key-fn cm)
token1)
(funcall (confusion-matrix-key-fn cm)
token2)
token1
cm))
(defun insert-entry-confusion-matrix (label1 label2 token cm)
"Inserts TOKEN as an occurence in the cell LABEL1 LABEL2 of the
confusion matrix CM."
(unless (existing-cell-p label1 label2 cm)
(create-cell label1 label2 cm))
(let ((cell (gethash
label2
(gethash label1 (confusion-matrix-rows cm)))))
(incf (aref cell 0))
(push `(,(sentence-id (token-sentence token))
,(token-id token))
(aref cell 1))
cell))
(defun existing-cell-p (label1 label2 cm)
"Predicate for verifying whether the cell for row LABEL1 and column
LABEL2 already exist in the confusion matrix CM."
(when (member label1
(alexandria:hash-table-keys
(confusion-matrix-rows cm))
:test (confusion-matrix-test-fn cm))
(let ((row (gethash label1
(confusion-matrix-rows cm))))
(assert (hash-table-p row))
(if (member label2
(alexandria:hash-table-keys
row)
:test (confusion-matrix-test-fn cm))
t))))
(defun create-cell (label1 label2 cm)
"Creates the cell for row LABEL1 and column LABEL2 in confusion
matrix CM."
(unless (member label1
(alexandria:hash-table-keys
(confusion-matrix-rows cm))
:test (confusion-matrix-test-fn cm))
(setf (gethash label1
(confusion-matrix-rows cm))
(make-hash-table :test
(confusion-matrix-test-fn cm))))
(let ((row (gethash label1
(confusion-matrix-rows cm))))
(unless (member label2
(alexandria:hash-table-keys
row)
:test (confusion-matrix-test-fn cm))
(setf (gethash label2
row)
(make-array
2
:initial-contents '(0 ()) )))))
;;; content adjustment
(defun confusion-matrix-copy (cm)
"Returns a new CONFUSION-MATRIX with the same cells values as CM."
(let ((cm-new (make-instance 'confusion-matrix
:test-fn (confusion-matrix-test-fn cm)
:key-fn (confusion-matrix-key-fn cm)
:corpus-id (confusion-matrix-corpus-id cm)
:sort-fn (confusion-matrix-sort-fn cm))))
(maphash
#'(lambda (label row)
(setf (gethash label
(confusion-matrix-rows cm-new))
(alexandria:copy-hash-table
(gethash label
(confusion-matrix-rows cm)))))
(confusion-matrix-rows cm))
cm-new))
(defun confusion-matrix-normalize (cm)
"Returns a new CONFUSION-MATRIX with new empty cells for each pair
(LABEL1 LABEL2) of labels in (confusion-matrix-labels CM) that are
undefined in CM."
(let* ((cm-labels (confusion-matrix-labels cm))
(cm-new (confusion-matrix-copy cm)))
(dolist (label1 cm-labels)
(unless (member label1
(alexandria:hash-table-keys
(confusion-matrix-rows cm-new))
:test (confusion-matrix-test-fn cm-new))
(setf (gethash label1
(confusion-matrix-rows cm-new))
(make-hash-table :test
(confusion-matrix-test-fn cm-new))))
(let ((row (gethash label1
(confusion-matrix-rows cm-new))))
(dolist (label2 cm-labels)
(unless (member label2
(alexandria:hash-table-keys
row)
:test (confusion-matrix-test-fn cm-new))
(setf (gethash label2
row)
(make-array
2
:initial-contents '(0 ()) ))))))
cm-new))