|
| 1 | +// |
| 2 | +// ConvolutionLayer.swift |
| 3 | +// MemkiteMetal |
| 4 | +// |
| 5 | +// Created by Torb Morland & Amund Tveit on 12/12/15. |
| 6 | +// Copyright © 2015 Memkite. All rights reserved. |
| 7 | +// |
| 8 | + |
| 9 | +import Foundation |
| 10 | +import Metal |
| 11 | + |
/// Extracts the dimensions and the flat value array from a serialized layer blob.
///
/// The blob must contain a "shape" dictionary holding a "dim" array, and a
/// "data" array; both arrays must bridge to `[Float]`.
///
/// The return type leaves no room to report failure, so a malformed blob still
/// traps (as the original force casts did), but with a message that names the
/// missing piece.
///
/// - Parameter blob: Dictionary describing one parameter blob of a layer.
/// - Returns: A tuple of (contents of `shape["dim"]`, contents of `data`).
func getDataFromBlob(blob: NSDictionary) -> ([Float], [Float]) {
    print(" ==> getDataFromBlob")

    guard let shape = blob["shape"] as? NSDictionary else {
        fatalError("getDataFromBlob: blob is missing its \"shape\" dictionary")
    }
    guard let data = blob["data"] as? [Float] else {
        fatalError("getDataFromBlob: blob is missing its \"data\" array of Float")
    }
    guard let dim = shape["dim"] as? [Float] else {
        fatalError("getDataFromBlob: blob shape is missing its \"dim\" array of Float")
    }

    // Swift arrays are values, so `data` can be handed back directly; there is
    // no need to copy it element by element into a second preallocated array.
    return (dim, data)
}
| 23 | + |
| 24 | + |
| 25 | + |
/// Builds a command buffer that encodes one convolution layer and returns it
/// together with the buffer that receives the layer output.
///
/// The layer dictionary must contain a "convolution_param" dictionary (optional
/// keys "pad", "kernel_size" and "stride", defaulting to 0, 1 and 1) and a
/// "blobs" array whose element 0 is the weight blob and element 1 the bias blob.
/// Parsed blobs are memoised in `blob_cache[layer_number]` under the keys "0"
/// (weights) and "1" (bias) so that repeated calls skip the parsing.
///
/// The returned command buffer is NOT committed here; the caller is responsible
/// for committing it.
///
/// - Parameters:
///   - layer: Serialized description of the convolution layer.
///   - inputBuffer: Metal buffer holding the layer input.
///   - inputShape: Four-element shape of the input; indices 2 and 3 are the
///     spatial extents used in the output-size formula.
///   - metalCommandQueue: Queue the command buffer is created from.
///   - metalDefaultLibrary: Library forwarded to the encoding helper.
///   - metalDevice: Device forwarded to the encoding helper.
///   - layer_data_caches: Per-layer Metal buffer caches, forwarded to the
///     encoding helper.
///   - blob_cache: Per-layer cache of parsed blobs; must already contain an
///     entry at index `layer_number`.
///   - layer_number: Index of this layer in both caches.
///   - layer_string: Forwarded unchanged to the encoding helper.
/// - Returns: (output buffer, uncommitted command buffer, output shape).
func createConvolutionLayerCached(layer: NSDictionary,
    inputBuffer: MTLBuffer,
    inputShape: [Float],
    metalCommandQueue: MTLCommandQueue, metalDefaultLibrary:MTLLibrary, metalDevice:MTLDevice,
    inout layer_data_caches: [Dictionary<String,MTLBuffer>],
    inout blob_cache: [Dictionary<String,([Float],[Float])>],
    layer_number: Int,
    layer_string: String) -> (MTLBuffer, MTLCommandBuffer, [Float]) {

    let start = NSDate()

    print("CREATECONVLAYERCACHED")

    let metalCommandBuffer = metalCommandQueue.commandBufferWithUnretainedReferences()

    print("NOTCACHINGMODE")
    guard let convolution_params_dict = layer["convolution_param"] as? NSDictionary else {
        fatalError("createConvolutionLayerCached: layer is missing its \"convolution_param\" dictionary")
    }
    let pad: Float = (convolution_params_dict["pad"] as? Float) ?? 0.0
    let kernel_size: Float = (convolution_params_dict["kernel_size"] as? Float) ?? 1.0
    // The stride was previously never read from the layer description, so every
    // layer was silently treated as stride 1.
    let stride: Float = (convolution_params_dict["stride"] as? Float) ?? 1.0
    precondition(stride > 0, "createConvolutionLayerCached: stride must be positive")

    let startblob = NSDate()

    // Weights: reuse the parsed blob when this layer has been seen before.
    let weightBlob: ([Float], [Float])
    if let cachedWeights = blob_cache[layer_number]["0"] {
        print("found blob key = 0 in cache")
        weightBlob = cachedWeights
    } else {
        print("didnt find blob key = 0 in cache")
        let blobs = layer["blobs"] as! [NSDictionary]
        weightBlob = getDataFromBlob(blobs[0])
        blob_cache[layer_number]["0"] = weightBlob
    }
    let (weight_shape, weights) = weightBlob

    // Bias: cached the same way as the weights instead of being re-parsed on
    // every call.
    let bias_data: [Float]
    if let cachedBias = blob_cache[layer_number]["1"] {
        bias_data = cachedBias.1
    } else {
        let blobs = layer["blobs"] as! [NSDictionary]
        let biasBlob = getDataFromBlob(blobs[1])
        blob_cache[layer_number]["1"] = biasBlob
        bias_data = biasBlob.1
    }

    print("### Time to blob: \(NSDate().timeIntervalSinceDate(startblob))")

    precondition(inputShape.count >= 4, "createConvolutionLayerCached: inputShape needs 4 entries")
    precondition(weight_shape.count >= 4, "createConvolutionLayerCached: weight shape needs 4 entries")

    // Output extent of a strided convolution: floor((in + 2*pad - kernel) / stride) + 1.
    // The floor matters as soon as the stride does not divide the numerator.
    let h = floor((inputShape[2] + 2 * pad - kernel_size) / stride) + 1
    let w = floor((inputShape[3] + 2 * pad - kernel_size) / stride) + 1
    let result_shape: [Float] = [inputShape[0], weight_shape[0], h, w]
    let outputCount = Int(result_shape.reduce(1, combine: *))

    // Tensor descriptors handed to the shaders, in the order
    // [input, weights, im2col, result].
    // NOTE(review): shape index 2 is passed as `width` and index 3 as `height`
    // throughout; presumably this matches what the shaders expect — confirm.
    let input_dimensions = MetalTensorDimensions(n: inputShape[0], channels: inputShape[1], width: inputShape[2], height: inputShape[3])
    let weight_dimensions = MetalTensorDimensions(n: weight_shape[0], channels: weight_shape[1], width: weight_shape[2], height: weight_shape[3])
    let col_dimensions = MetalTensorDimensions(n: inputShape[0], channels: inputShape[1] * weight_shape[2] * weight_shape[3], width: inputShape[2], height: inputShape[3])
    let result_dimensions = MetalTensorDimensions(n: result_shape[0], channels: result_shape[1], width: result_shape[2], height: result_shape[3])
    let tensor_dimensions = [input_dimensions, weight_dimensions, col_dimensions, result_dimensions]

    // Only the element count of the im2col tensor is needed here, so compute it
    // directly instead of allocating a throw-away array of that size.
    let im2ColCount = Int(col_dimensions.n * col_dimensions.channels * col_dimensions.height * col_dimensions.width)

    let convolution_params = MetalConvolutionParameters(pad: pad, kernel_size: kernel_size, stride: stride)
    print("AFTER NOTCACHINGMODE")

    print("BEFORE THE BIG CALL")

    let resultBuffer = addConvolutionCommandToCommandBufferCached(metalCommandBuffer, inputBuffer: inputBuffer, im2ColCount: im2ColCount, weights: weights, outputCount: outputCount, convolution_params: convolution_params, tensor_dimensions: tensor_dimensions, bias: bias_data, metalDefaultLibrary: metalDefaultLibrary, metalDevice:metalDevice, layer_data_caches: &layer_data_caches, layer_number: layer_number,layer_string: layer_string)

    print("AFTER BIG CALL")

    print("### Time to setup convolution layer: \(NSDate().timeIntervalSinceDate(start))")

    return (resultBuffer, metalCommandBuffer, result_shape)

}
| 138 | + |
/// Encodes the two compute passes of a convolution layer into `commandBuffer`:
/// first the "im2col" shader, then the "convolution_layer" shader.
///
/// All Metal buffers other than `inputBuffer` are obtained through the
/// `createOrReuse…` helpers, keyed by name within `layer_data_caches` for
/// `layer_number`. Nothing is committed here.
///
/// - Parameters:
///   - commandBuffer: Command buffer that receives both compute encoders.
///   - inputBuffer: Layer input, bound at index 0 of the im2col pass.
///   - im2ColCount: Element count of the intermediate im2col buffer.
///   - weights: Flat weight values for the layer.
///   - outputCount: Element count of the layer output.
///   - convolution_params: Parameters bound at index 2 of the im2col pass.
///   - tensor_dimensions: Tensor descriptors bound in both passes.
///   - bias: Flat bias values for the layer.
///   - metalDefaultLibrary: Library the shader functions are looked up in.
///   - metalDevice: Device used for pipeline and buffer creation.
///   - layer_data_caches: Per-layer Metal buffer caches, updated by the helpers.
///   - layer_number: Index of this layer in `layer_data_caches`.
///   - layer_string: Not used inside this function.
/// - Returns: The Metal buffer bound as the output of the convolution pass.
func addConvolutionCommandToCommandBufferCached(commandBuffer: MTLCommandBuffer,
    inputBuffer: MTLBuffer,
    im2ColCount: Int,
    weights: [Float],
    outputCount: Int,
    convolution_params: MetalConvolutionParameters,
    tensor_dimensions: [MetalTensorDimensions],
    bias: [Float],
    metalDefaultLibrary:MTLLibrary, metalDevice:MTLDevice,
    inout layer_data_caches: [Dictionary<String,MTLBuffer>],
    layer_number: Int,
    layer_string: String) -> MTLBuffer {

    let encodingStarted = NSDate()

    print("before output and col_output")

    // Host-side arrays that give the result and im2col buffers their sizes.
    let resultSeed: [Float] = createFloatNumbersArray(outputCount)
    let columnSeed: [Float] = createFloatNumbersArray(im2ColCount)

    print("before setupshaderinpipeline")

    let (_, im2colPipeline, _) = setupShaderInMetalPipeline("im2col", metalDefaultLibrary: metalDefaultLibrary, metalDevice: metalDevice)

    // Buffers shared by the two passes, fetched from (or added to) the layer cache.
    let resultMetalBuffer = createOrReuseFloatMetalBuffer("resultMetalBuffer", data: resultSeed, cache: &layer_data_caches, layer_number: layer_number, metalDevice: metalDevice)

    print("after resultmetalbuffer")

    let weightMetalBuffer = createOrReuseFloatMetalBuffer("weightMetalBuffer", data: weights, cache: &layer_data_caches, layer_number:layer_number, metalDevice: metalDevice)
    let paramsMetalBuffer = createOrReuseConvolutionParametersMetalBuffer("convolutionParamsMetalBuffer", data: convolution_params, cache: &layer_data_caches, layer_number: layer_number, metalDevice: metalDevice)
    let dimensionsMetalBuffer = createOrReuseTensorDimensionsVectorMetalBuffer("tensorDimensionsMetalBuffer", data: tensor_dimensions, cache: &layer_data_caches, layer_number: layer_number, metalDevice: metalDevice)
    let columnMetalBuffer = createOrReuseFloatMetalBuffer("colOutputMetalBuffer", data: columnSeed, cache: &layer_data_caches, layer_number: layer_number, metalDevice: metalDevice)
    let biasMetalBuffer = createOrReuseFloatMetalBuffer("bias", data: bias, cache: &layer_data_caches, layer_number:layer_number, metalDevice: metalDevice)

    // ---- Pass 1: im2col -------------------------------------------------
    let im2colEncoder = commandBuffer.computeCommandEncoder()
    im2colEncoder.setComputePipelineState(im2colPipeline)
    im2colEncoder.setBuffer(inputBuffer, offset: 0, atIndex: 0)
    im2colEncoder.setBuffer(dimensionsMetalBuffer, offset: 0, atIndex: 1)
    im2colEncoder.setBuffer(paramsMetalBuffer, offset: 0, atIndex: 2)
    im2colEncoder.setBuffer(columnMetalBuffer, offset: 0, atIndex: 3)

    // One thread per im2col element, grouped by the pipeline's execution width.
    let im2colWidth = im2colPipeline.threadExecutionWidth
    let im2colGroupSize = MTLSize(width: im2colWidth, height: 1, depth: 1)
    print("before mtlsize 2")
    // Rounded-up division, so there is always at least one threadgroup.
    let im2colGroupCount = MTLSize(width: (columnSeed.count - 1) / im2colWidth + 1, height: 1, depth: 1)
    im2colEncoder.dispatchThreadgroups(im2colGroupCount, threadsPerThreadgroup: im2colGroupSize)

    print("after dispatch")

    im2colEncoder.endEncoding()

    // ---- Pass 2: convolution --------------------------------------------
    let (_, convolutionPipeline, _) = setupShaderInMetalPipeline("convolution_layer", metalDefaultLibrary: metalDefaultLibrary, metalDevice: metalDevice)
    let convolutionEncoder = commandBuffer.computeCommandEncoder()
    convolutionEncoder.setComputePipelineState(convolutionPipeline)
    convolutionEncoder.setBuffer(resultMetalBuffer, offset: 0, atIndex: 0)
    convolutionEncoder.setBuffer(weightMetalBuffer, offset: 0, atIndex: 1)
    convolutionEncoder.setBuffer(dimensionsMetalBuffer, offset: 0, atIndex: 2)
    convolutionEncoder.setBuffer(columnMetalBuffer, offset: 0, atIndex: 3)
    convolutionEncoder.setBuffer(biasMetalBuffer, offset: 0, atIndex: 4)

    // One thread per output element, again rounded up to whole threadgroups.
    let convolutionWidth = convolutionPipeline.threadExecutionWidth
    let convolutionGroupSize = MTLSize(width: convolutionWidth, height: 1, depth: 1)
    let convolutionGroupCount = MTLSize(width: (outputCount - 1) / convolutionWidth + 1, height: 1, depth: 1)
    convolutionEncoder.dispatchThreadgroups(convolutionGroupCount, threadsPerThreadgroup: convolutionGroupSize)

    convolutionEncoder.endEncoding()

    print("after endencoding")

    print("#### Time to add convolution layer: \(NSDate().timeIntervalSinceDate(encodingStarted))")

    return resultMetalBuffer

}
| 237 | + |
0 commit comments