-
Notifications
You must be signed in to change notification settings - Fork 87
/
Copy pathPrincipalComponantAnalysis.swift
197 lines (169 loc) · 8.38 KB
/
PrincipalComponantAnalysis.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
//
// PrincipalComponantAnalysis.swift
// AIToolbox
//
// Created by Kevin Coble on 3/22/16.
// Copyright © 2016 Kevin Coble. All rights reserved.
//
import Foundation
import Accelerate
/// Failures reported by the `PCA` class.
enum PCAError: Error {
    // Raised while computing the basis: bad dimension setup, rejected SVD arguments, SVD failed to converge
    case invalidDimensions, errorInSVDParameters, svdDidNotConverge
    // Raised while transforming data: no basis available yet, or a converted point could not be stored
    case pcaNotPerformed, transformError
}
/// Class to perform principal component analysis
///
/// Create an instance with the source and target dimensions, call
/// `getReducedBasisVectorSet(_:)` with a data set to compute the mean and the basis vectors,
/// then call `transformDataSet(_:)` to project data sets onto that basis.
/// A computed model can be written with `saveToFile(_:)` and restored with `init?(loadFromFile:)`.
open class PCA {
    /// Dimension of the input vectors the analysis is performed on
    open fileprivate(set) var initialDimension : Int
    /// Dimension of the vectors produced by `transformDataSet(_:)`
    open fileprivate(set) var reducedDimension : Int
    /// Mean of the data set used to find basis vectors
    open var μ : [Double] = []
    /// Values returned by the SVD of the mean-centered data matrix - sized at initialDimension.
    /// These are the singular values as written by the LAPACK routine (no squaring or scaling is applied).
    open var eigenValues : [Double] = []
    /// The top 'reducedDimension' basis vectors, stored one after another with
    /// 'initialDimension' values each (reducedDimension x initialDimension, row-major)
    open var basisVectors : [Double] = []

    /// Creates an analysis object for the given dimensions.  No validation is done here;
    /// the dimensions are checked when `getReducedBasisVectorSet(_:)` is called.
    /// - Parameters:
    ///   - initialSize: dimension of the input data
    ///   - reduceSize: dimension to reduce the data to
    public init(initialSize: Int, reduceSize: Int)
    {
        initialDimension = initialSize
        reducedDimension = reduceSize
    }

    /// Creates an analysis object from a property list previously written by `saveToFile(_:)`.
    ///
    /// Returns nil if the file cannot be read as a dictionary, or if any of the expected
    /// entries is missing or has the wrong type.
    /// - Parameter path: path of the property list file
    public init?(loadFromFile path: String)
    {
        // Give the properties without defaults a value so every failure path below can simply return nil
        initialDimension = 0
        reducedDimension = 0

        // Read the property list - conditional casts so a malformed file fails the init instead of trapping
        guard let pList = NSDictionary(contentsOfFile: path),
              let dictionary = pList as? [String: AnyObject] else { return nil }

        // Get the dimensions and the model arrays from the dictionary
        guard let initialDimensionValue = dictionary["initialDimension"] as? Int,
              let reducedDimensionValue = dictionary["reducedDimension"] as? Int,
              let meanArray = dictionary["mean"] as? [Double],
              let eigenValueArray = dictionary["eigenValues"] as? [Double],
              let eigenVectorArray = dictionary["basisVectors"] as? [Double] else { return nil }

        initialDimension = initialDimensionValue
        reducedDimension = reducedDimensionValue
        μ = meanArray
        eigenValues = eigenValueArray
        basisVectors = eigenVectorArray
    }

    /// Routine to get the reduced eigenvector set that is the basis for the reduced dimension subspace
    ///
    /// Computes the mean of the data set, centers the data on it, runs an SVD (LAPACK `dgesdd`)
    /// on the centered matrix and stores the first 'reducedDimension' rows of V-transpose
    /// in `basisVectors`.  `μ` and `eigenValues` are overwritten as well.
    /// - Parameter data: data set whose inputs are analyzed
    /// - Throws: `PCAError.invalidDimensions` if the configured dimensions are unusable,
    ///   `MachineLearningError.dataWrongDimension` if the data set does not match `initialDimension`,
    ///   `MachineLearningError.notEnoughData` if there are fewer than two points,
    ///   `PCAError.errorInSVDParameters` / `PCAError.svdDidNotConverge` on SVD failure,
    ///   and any error thrown by the data set while fetching inputs.
    open func getReducedBasisVectorSet(_ data: MLDataSet) throws
    {
        // Verify we have a valid setup
        guard initialDimension >= 2, reducedDimension >= 1, initialDimension >= reducedDimension else {
            throw PCAError.invalidDimensions
        }

        // Verify the data set matches the initial dimension
        guard data.inputDimension == initialDimension else {
            throw MachineLearningError.dataWrongDimension
        }

        // Make sure we have enough data
        let pointCount = data.size
        guard pointCount >= 2 else {
            throw MachineLearningError.notEnoughData
        }

        let vectorLength = vDSP_Length(initialDimension)

        // Get the mean of the data
        μ = [Double](repeating: 0.0, count: initialDimension)
        for point in 0..<pointCount {
            let inputs = try data.getInput(point)
            vDSP_vaddD(inputs, 1, μ, 1, &μ, 1, vectorLength)
        }
        var scale = 1.0 / Double(pointCount)
        vDSP_vsmulD(μ, 1, &scale, &μ, 1, vectorLength)

        // Get the data matrix with a mean of 0 - in column-major format for the LAPACK routines
        // (one row per data point, one column per input dimension)
        var X = [Double](repeating: 0.0, count: initialDimension * pointCount)
        var row = [Double](repeating: 0.0, count: initialDimension)
        for point in 0..<pointCount {
            let inputs = try data.getInput(point)
            vDSP_vsubD(μ, 1, inputs, 1, &row, 1, vectorLength)      // row = inputs - μ
            for column in 0..<initialDimension {
                X[column * pointCount + point] = row[column]
            }
        }

        // Get the SVD decomposition of the X matrix
        var jobZ = Int8(UInt8(ascii: "S"))      // 'S' - return only the first min(m,n) columns of U and rows of Vt
        var m = __CLPK_integer(pointCount)
        var n = __CLPK_integer(initialDimension)
        eigenValues = [Double](repeating: 0.0, count: initialDimension)
        // With job 'S' the routine only writes an m x min(m,n) U matrix, so there is no need for m x m storage
        var u = [Double](repeating: 0.0, count: pointCount * min(pointCount, initialDimension))
        // Vt is passed with a leading dimension of n, so it needs n x n storage.
        // Rows past min(m,n) are not written by the routine and stay zero.
        var vTranspose = [Double](repeating: 0.0, count: initialDimension * initialDimension)
        var work : [Double] = [0.0]
        var lwork : __CLPK_integer = -1        // Ask for the best size of the work array
        let iworkSize = 8 * __CLPK_integer(min(m,n))
        var iwork = [__CLPK_integer](repeating: 0, count: Int(iworkSize))
        var info : __CLPK_integer = 0
        dgesdd_(&jobZ, &m, &n, &X, &m, &eigenValues, &u, &m, &vTranspose, &n, &work, &lwork, &iwork, &info)
        if (info != 0 || work[0] < 1) {
            throw PCAError.errorInSVDParameters
        }

        // Allocate the recommended work array and run the actual decomposition
        lwork = __CLPK_integer(work[0])
        work = [Double](repeating: 0.0, count: Int(work[0]))
        dgesdd_(&jobZ, &m, &n, &X, &m, &eigenValues, &u, &m, &vTranspose, &n, &work, &lwork, &iwork, &info)
        if (info < 0) {
            throw PCAError.errorInSVDParameters
        }
        if (info > 0) {
            throw PCAError.svdDidNotConverge
        }

        // Extract the new basis vectors - make a row-major matrix for dataset matrix multiplication using vDSP
        basisVectors = [Double](repeating: 0.0, count: reducedDimension * initialDimension)
        for vector in 0..<reducedDimension {
            for column in 0..<initialDimension {
                // vTranspose is column-major with leading dimension initialDimension
                basisVectors[(vector * initialDimension) + column] = vTranspose[vector + (column * initialDimension)]
            }
        }
    }

    /// Routine to transform the given dataset into a new dataset using the basis vectors calculated
    ///
    /// Each input is centered on `μ` and multiplied by the basis-vector matrix.  The results are
    /// added as unlabeled points to a new regression `DataSet` with 'reducedDimension' inputs.
    /// - Parameter data: data set to convert; its input dimension must equal `initialDimension`
    /// - Returns: a new data set holding the converted points
    /// - Throws: `PCAError.pcaNotPerformed` if no basis vectors are available,
    ///   `MachineLearningError.dataWrongDimension` if the data dimension does not match,
    ///   `PCAError.invalidDimensions` if the stored mean or basis vectors are too small for the
    ///   configured dimensions, `PCAError.transformError` if a converted point cannot be added,
    ///   and any error thrown by the data set while fetching inputs.
    open func transformDataSet(_ data: MLDataSet) throws ->DataSet
    {
        // Make sure we have the PCA results to use
        guard !basisVectors.isEmpty else { throw PCAError.pcaNotPerformed }

        // Make sure the data dimension matches
        guard data.inputDimension == initialDimension else { throw MachineLearningError.dataWrongDimension }

        // The mean and basis vectors can be assigned from outside or come from a file, so make sure
        // they are large enough before handing their raw storage to vDSP
        guard initialDimension > 0, reducedDimension > 0,
              μ.count >= initialDimension,
              basisVectors.count >= reducedDimension * initialDimension else {
            throw PCAError.invalidDimensions
        }

        // Make a new data set with the new dimension
        let result = DataSet(dataType: .regression, inputDimension: reducedDimension, outputDimension: 1)

        // Convert each data point
        var centered = [Double](repeating: 0.0, count: initialDimension)
        var transformed = [Double](repeating: 0.0, count: reducedDimension)
        for point in 0..<data.size {
            // Move relative to the mean of the training data
            let inputs = try data.getInput(point)
            vDSP_vsubD(μ, 1, inputs, 1, &centered, 1, vDSP_Length(initialDimension))

            // Convert to the new basis vector: (reduced x initial) * (initial x 1)
            vDSP_mmulD(basisVectors, 1, centered, 1, &transformed, 1, vDSP_Length(reducedDimension), vDSP_Length(1), vDSP_Length(initialDimension))

            // Add to the new dataset
            do {
                try result.addUnlabeledDataPoint(input: transformed)
            }
            catch {
                throw PCAError.transformError
            }
        }

        // Return the result
        return result
    }

    /// Errors thrown by `saveToFile(_:)`
    public enum PCAWriteErrors: Error { case failedWriting }

    /// Routine to write the model result parameters to a property list at the provided path
    /// - Parameter path: path of the file to write
    /// - Throws: `PCAWriteErrors.failedWriting` if the property list could not be written
    open func saveToFile(_ path: String) throws
    {
        // Create a property list of the PCA model
        var modelDictionary = [String: AnyObject]()
        modelDictionary["initialDimension"] = initialDimension as AnyObject?
        modelDictionary["reducedDimension"] = reducedDimension as AnyObject?
        modelDictionary["mean"] = μ as AnyObject?
        modelDictionary["eigenValues"] = eigenValues as AnyObject?
        modelDictionary["basisVectors"] = basisVectors as AnyObject?

        // Convert to a property list (NSDictionary) and write
        let pList = NSDictionary(dictionary: modelDictionary)
        if !pList.write(toFile: path, atomically: false) { throw PCAWriteErrors.failedWriting }
    }
}