-
Notifications
You must be signed in to change notification settings - Fork 5
/
conllu-prolog.lisp
121 lines (99 loc) · 5.11 KB
/
conllu-prolog.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
;; Copyright 2016 IBM
;; Licensed under the Apache License, Version 2.0 (the "License");
;; you may not use this file except in compliance with the License.
;; You may obtain a copy of the License at
;; http://www.apache.org/licenses/LICENSE-2.0
;; Unless required by applicable law or agreed to in writing, software
;; distributed under the License is distributed on an "AS IS" BASIS,
;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;; See the License for the specific language governing permissions and
;; limitations under the License.
;;;; conll-prolog.lisp
(in-package #:conllu.prolog)
(defun toprologid (str)
(cl-ppcre:regex-replace-all "[^A-Za-z0-9_]"
(string-downcase (string-trim '(#\- #\. #\, #\; #\` #\' #\space)
(format nil "~A" str))) "_"))
(defun make-id (context prefix id)
(unless id
(setf id (uuid:make-v4-uuid)))
(if context
(format nil "c~a_~a_~a" (toprologid context) (toprologid prefix) (toprologid id))
(format nil "~a_~a" (toprologid prefix) (toprologid id))))
(defun is-root (str)
(string-equal str "root"))
(defun clean-dep-rel (str)
(cl-ppcre:regex-replace-all "[^A-Za-z0-9_]"
(string-downcase (string-trim '(#\- #\. #\, #\; #\` #\' #\space) str)) "_"))
(defparameter *clauses* nil)
(defparameter *dependencies* nil)
(defun emit-prolog (clause text)
(push text (gethash clause *clauses*)))
(defun write-prolog (out)
(format out "%% -*- prolog -*-~%")
(format out ":- dynamic nlp_dependency/4.~%")
(maphash (lambda (k clauses)
(format out "~%")
(format out "%% ~a~%" k)
(dolist (c clauses) (format out "~a~%" c))) *clauses*))
(defun prolog-string (str &optional (downcase t))
(format nil "'~a'" (cl-ppcre:regex-replace-all "([\'\\\\])" (if downcase (string-downcase str) str)
'("\\" :match))))
(defun process-features (sentence-id word-index-id feats-str)
(let ((tags nil)
(features (split-sequence #\| feats-str)))
(dolist (feat features)
(let ((k-v (split-sequence #\= feat)))
(when (= 2 (length k-v))
(emit-prolog (car k-v) (format nil "nlp_feat_~a(~a,~a,~a)." (car k-v) sentence-id word-index-id
(prolog-string (cadr k-v) nil))))
;; features that don't have a Key=Value pair are considered
;; "tags" and will be emitted at the end all at once
(when (= 1 (length k-v))
(push (car k-v) tags))))
(when tags
(emit-prolog "tags" (format nil "nlp_tag(~a,~a,[~{'~a'~^,~}])." sentence-id word-index-id (reverse tags))))))
(defun process-tokens (context sentence-id token)
(let ((word-index-id (make-id context "i" (token-id token)))
(dep-rel (token-deprel token))
(misc (mapcar (lambda (x) (split-sequence #\= x)) (split-sequence #\| (token-misc token))))
(head-id (make-id context "i" (token-head token))))
(when (> (length misc) 0)
(let ((sense (assoc "FlSense" misc :test #'equal)))
(when (and sense (not (equal "?" (cadr sense))))
(emit-prolog "sense" (format nil "nlp_sense(~a,~a,~a)."
sentence-id word-index-id (prolog-string (cadr sense)))))))
(emit-prolog "dependency" (format nil "nlp_dependency(~a,~a,~a,~a)."
sentence-id word-index-id head-id (prolog-string dep-rel)))
(process-features sentence-id word-index-id (token-feats token))
(process-features sentence-id word-index-id (token-misc token))
(emit-prolog "idx" (format nil "nlp_index(~a,~a,~a)." sentence-id word-index-id (token-id token)))
(emit-prolog "form" (format nil "nlp_form(~a,~a,~a)."
sentence-id word-index-id (prolog-string (token-form token) nil)))
(emit-prolog "lemma" (format nil "nlp_lemma(~a,~a,~a)."
sentence-id word-index-id (prolog-string (token-lemma token) nil)))
(emit-prolog "pos" (format nil "nlp_pos(~a,~a,'~a')." sentence-id word-index-id (token-upostag token)))
(if (is-root dep-rel)
(emit-prolog "root" (format nil "nlp_sent_root(~a,~a)." sentence-id word-index-id))
(unless (gethash (clean-dep-rel dep-rel) *dependencies*)
(setf (gethash (clean-dep-rel dep-rel) *dependencies*) (prolog-string dep-rel))))))
(defun clean-whitespace (line)
(string-trim '(#\space #\tab) line))
(defun valid-line (line)
(> (length line) 0))
(defun convert-filename (context filename-in filename-out)
(setf *clauses* (make-hash-table :test #'equal))
(setf *dependencies* (make-hash-table :test #'equal))
(let ((sentences (read-conllu filename-in)))
(dolist (s sentences)
(let ((sid (make-id context "s" (sentence-meta-value s "sent_id"))))
(dolist (metadata (sentence-meta s))
(emit-prolog (format nil "sentence_~a" (car metadata))
(format nil "nlp_sentence_~a(~a,~a)." (toprologid (car metadata)) sid
(prolog-string (cdr metadata) nil))))
(emit-prolog "sentence" (format nil "nlp_sentence(~a)." sid))
(dolist (tk (sentence-tokens s))
(process-tokens context sid tk)))))
(with-open-file (fout filename-out :direction :output :if-exists :supersede)
(write-prolog fout)))
;; (convert-filename "sample" "repos/conll-prolog/complex.conll" "complex.pl")