Skip to content

Commit 006361b

Browse files
zeahmed
authored and migueldeicaza committed
Implemented adaptive SGD optimizer (Adagrad). (migueldeicaza#412)
* Setting variable shared_name property to avoid variables sharing. * Added SGD and MomentumSGD optimizers together with relevant tests. * Tests added for momentum and Nesterov SGD with and without lr decay. * Added MNIST multilayer test. * Added MNIST GPU test in disabled mode. * Added support to place an operation on a specific device. * Disabled 'DevicePlacementTest' because it requires GPUs. * Added MNIST multilayer test. * Updated comments. * Disabled MnistGPU test. * Removed unnecessary files. * Added Adagrad optimization algorithm.
1 parent 4a8356c commit 006361b

File tree

7 files changed

+467
-3
lines changed

7 files changed

+467
-3
lines changed

TensorFlowSharp/Optimizer.cs

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ public virtual (TFOutput gradient, Variable variable)[] ComputeGradient(TFOutput
7777
/// <param name="varList">list of variables to compute the gradients for.
7878
/// If null the gradient is computed for all the trainable variables in the graph.</param>
7979
/// <returns>An Operation that updates the variables.</returns>
80-
public abstract TFOperation[] Minimize(TFOutput loss, Variable[] varList = null);
80+
public virtual TFOperation[] Minimize(TFOutput loss, Variable[] varList = null)
81+
{
82+
return ApplyGradient(ComputeGradient(loss, varList));
83+
}
8184
}
8285

8386
/// <summary>
@@ -188,11 +191,103 @@ public override TFOperation[] ApplyGradient((TFOutput gradient, Variable variabl
188191
}
189192
return _updateOps.ToArray();
190193
}
194+
}
195+
196+
/// <summary>
197+
/// Adaptive stochastic gradient descent optimizer.
198+
/// </summary>
199+
public sealed class Adagrad : Optimizer
200+
{
201+
/// <summary>
202+
/// Variable to keep track of the number of iterations (mini-batches processed)
203+
/// </summary>
204+
public Variable Iterations { get; }
205+
206+
/// <summary>
207+
/// Variable to keep track of the learning rate.
208+
/// </summary>
209+
public Variable LearningRate { get; }
210+
211+
private readonly string _lrName = "LearningRate";
212+
private readonly IList<TFOperation> _updateOps = new List<TFOperation>();
213+
private float _initialAccumulatorValue;
214+
private TFOutput _epsilon;
215+
216+
/// <summary>
217+
/// Construct Adagrad optimizer.
218+
/// </summary>
219+
/// <param name="graph">The graph object.</param>
220+
/// <param name="learningRate">The learning rate for the SGD update.</param>
221+
/// <param name="decay">Learning rate decay over each update.</param>
222+
/// <param name="initialAccumulatorValue">A floating point value. Starting value for the accumulators, must be positive.</param>
223+
/// <param name="operName">Name of the optimizer. All the variables that are created in this class will be created under this scope.</param>
224+
public Adagrad(TFGraph graph, float learningRate, float decay = 0, float initialAccumulatorValue = 0.1f, string operName = "AdagradOptimizer") : base(graph, operName)
225+
{
226+
if (initialAccumulatorValue < 0)
227+
throw new ArgumentException($"Value must be positive. initialAccumulatorValue = {initialAccumulatorValue}");
228+
229+
using (var scope = _graph.WithScope(_optimizerName))
230+
{
231+
Iterations = _graph.Variable(_graph.Const(new TFTensor(0L)), trainable: false, operName: "iterations");
232+
_updateOps.Add(_graph.AssignAddVariableOp(Iterations, _graph.Const(1L)));
233+
var initialLearningRate = _graph.Const(learningRate);
234+
LearningRate = _graph.Variable(initialLearningRate, trainable: false, operName: _lrName);
235+
CreateDecayOps(decay, initialLearningRate);
236+
}
237+
_initialAccumulatorValue = initialAccumulatorValue;
238+
_epsilon = _graph.Const(1e-7f);
239+
}
240+
241+
private void CreateDecayOps(float decay, TFOutput initialLearningRate)
242+
{
243+
if (decay > 0)
244+
{
245+
var _decay = _graph.Const(decay, "Decay");
246+
var one = _graph.Const(1f);
247+
_updateOps.Add(_graph.AssignVariableOp(LearningRate,
248+
_graph.Mul(initialLearningRate,
249+
_graph.Div(one,
250+
_graph.Add(one,
251+
_graph.Mul(_decay,
252+
_graph.Cast(Iterations.Read, _decay.OutputType)
253+
)
254+
)
255+
)
256+
)));
257+
}
258+
}
259+
260+
private TFOutput[] InitMoments((TFOutput gradient, Variable variable)[] gradientsAndVariables)
261+
{
262+
var accumulators = new TFOutput[gradientsAndVariables.Length];
263+
for (int i = 0; i < gradientsAndVariables.Length; i++)
264+
{
265+
var gv = gradientsAndVariables[i];
266+
var varType = gv.variable.Read.OutputType;
267+
var varShape = _graph.GetTensorShape(gv.variable.Read);
268+
accumulators[i] = _graph.VariableV2(varShape, varType);
269+
_graph.AddInitVariable(_graph.Assign(accumulators[i], _graph.Constant(_initialAccumulatorValue, varShape, varType)).Operation);
270+
}
271+
return accumulators;
272+
}
191273

192274
/// <inheritdoc />
193-
public override TFOperation[] Minimize(TFOutput loss, Variable[] varList = null)
275+
public override TFOperation[] ApplyGradient((TFOutput gradient, Variable variable)[] gradientsAndVariables)
194276
{
195-
return ApplyGradient(ComputeGradient(loss, varList));
277+
var accumulators = InitMoments(gradientsAndVariables);
278+
for (int i = 0; i < gradientsAndVariables.Length; i++)
279+
{
280+
var gv = gradientsAndVariables[i];
281+
var lr = _graph.Cast(LearningRate.Read, gv.gradient.OutputType);
282+
// accum = accum + g ** 2;
283+
var accum = _graph.Add(accumulators[i], _graph.Square(gv.gradient));
284+
// accumulators[i] = accum
285+
_updateOps.Add(_graph.Assign(accumulators[i], accum).Operation);
286+
// w = w - lr * g / sqrt(accum + 1e-7)
287+
var denom = _graph.Div(_graph.Mul(lr, gv.gradient), _graph.Sqrt(_graph.Add(accum, _epsilon)));
288+
_updateOps.Add(_graph.AssignSubVariableOp(gv.variable, denom));
289+
}
290+
return _updateOps.ToArray();
196291
}
197292
}
198293
}

tests/TensorFlowSharp.Tests.CSharp/OptimizerTests.cs

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -556,5 +556,118 @@ public void MNISTTwoHiddenLayerNetworkGPUTest()
556556
}
557557
}
558558
}
559+
560+
561+
[Fact]
562+
public void LinearRegresionTrainingWithAdagradTest()
563+
{
564+
Console.WriteLine("Linear regression");
565+
// Parameters
566+
var learning_rate = 0.01f;
567+
var training_epochs = 5;
568+
569+
// Training data
570+
var train_x = new float[] {
571+
3.3f, 4.4f, 5.5f, 6.71f, 6.93f, 4.168f, 9.779f, 6.182f, 7.59f, 2.167f,
572+
7.042f, 10.791f, 5.313f, 7.997f, 5.654f, 9.27f, 3.1f
573+
};
574+
var train_y = new float[] {
575+
1.7f, 2.76f,2.09f,3.19f,1.694f,1.573f,3.366f,2.596f,2.53f,1.221f,
576+
2.827f,3.465f,1.65f,2.904f,2.42f,2.94f,1.3f
577+
};
578+
var n_samples = train_x.Length;
579+
using (var graph = new TFGraph())
580+
{
581+
var rng = new Random(0);
582+
// tf Graph Input
583+
584+
var X = graph.Placeholder(TFDataType.Float, TFShape.Scalar);
585+
var Y = graph.Placeholder(TFDataType.Float, TFShape.Scalar);
586+
587+
var W = graph.Variable(graph.Const(0.1f), operName: "weight");
588+
var b = graph.Variable(graph.Const(0.1f), operName: "bias");
589+
var pred = graph.Add(graph.Mul(X, W.Read, "x_w"), b.Read);
590+
591+
var cost = graph.Div(graph.ReduceSum(graph.Pow(graph.Sub(pred, Y), graph.Const(2f))), graph.Mul(graph.Const(2f), graph.Const((float)n_samples), "2_n_samples"));
592+
593+
var sgd = new Adagrad(graph, learning_rate);
594+
var updateOps = sgd.Minimize(cost);
595+
596+
using (var sesssion = new TFSession(graph))
597+
{
598+
sesssion.GetRunner().AddTarget(graph.GetGlobalVariablesInitializer()).Run();
599+
600+
var expectedLines = File.ReadAllLines(Path.Combine(_testDataPath, "Adagrad", "expected.txt"));
601+
for (int i = 0; i < training_epochs; i++)
602+
{
603+
for (int j = 0; j < n_samples; j++)
604+
{
605+
var tensors = sesssion.GetRunner()
606+
.AddInput(X, new TFTensor(train_x[j]))
607+
.AddInput(Y, new TFTensor(train_y[j]))
608+
.AddTarget(updateOps).Fetch(cost, W.Read, b.Read, pred).Run();
609+
var output = $"loss: {tensors[0].GetValue():F4}, W: {tensors[1].GetValue():F4}, b: {tensors[2].GetValue():F4}";
610+
Assert.Equal(expectedLines[i * n_samples + j], output);
611+
}
612+
}
613+
}
614+
}
615+
}
616+
617+
[Fact]
618+
public void LinearRegresionTrainingWithAdagradDecayTest()
619+
{
620+
Console.WriteLine("Linear regression");
621+
// Parameters
622+
var learning_rate = 0.01f;
623+
var training_epochs = 5;
624+
625+
// Training data
626+
var train_x = new float[] {
627+
3.3f, 4.4f, 5.5f, 6.71f, 6.93f, 4.168f, 9.779f, 6.182f, 7.59f, 2.167f,
628+
7.042f, 10.791f, 5.313f, 7.997f, 5.654f, 9.27f, 3.1f
629+
};
630+
var train_y = new float[] {
631+
1.7f, 2.76f,2.09f,3.19f,1.694f,1.573f,3.366f,2.596f,2.53f,1.221f,
632+
2.827f,3.465f,1.65f,2.904f,2.42f,2.94f,1.3f
633+
};
634+
var n_samples = train_x.Length;
635+
using (var graph = new TFGraph())
636+
{
637+
var rng = new Random(0);
638+
// tf Graph Input
639+
640+
var X = graph.Placeholder(TFDataType.Float, TFShape.Scalar);
641+
var Y = graph.Placeholder(TFDataType.Float, TFShape.Scalar);
642+
643+
var W = graph.Variable(graph.Const(0.1f), operName: "weight");
644+
var b = graph.Variable(graph.Const(0.1f), operName: "bias");
645+
var pred = graph.Add(graph.Mul(X, W.Read, "x_w"), b.Read);
646+
647+
var cost = graph.Div(graph.ReduceSum(graph.Pow(graph.Sub(pred, Y), graph.Const(2f))), graph.Mul(graph.Const(2f), graph.Const((float)n_samples), "2_n_samples"));
648+
649+
var sgd = new Adagrad(graph, learning_rate, decay: 0.5f);
650+
var updateOps = sgd.Minimize(cost);
651+
652+
using (var sesssion = new TFSession(graph))
653+
{
654+
sesssion.GetRunner().AddTarget(graph.GetGlobalVariablesInitializer()).Run();
655+
656+
var expectedLines = File.ReadAllLines(Path.Combine(_testDataPath, "AdagradTimeDecay", "expected.txt"));
657+
for (int i = 0; i < training_epochs; i++)
658+
{
659+
for (int j = 0; j < n_samples; j++)
660+
{
661+
var tensors = sesssion.GetRunner()
662+
.AddInput(X, new TFTensor(train_x[j]))
663+
.AddInput(Y, new TFTensor(train_y[j]))
664+
.AddTarget(updateOps).Fetch(sgd.Iterations.Read, cost, W.Read, b.Read, sgd.LearningRate.Read).Run();
665+
var output = $"step: {tensors[0].GetValue():D}, loss: {tensors[1].GetValue():F4}, W: {tensors[2].GetValue():F4}, b: {tensors[3].GetValue():F4}, lr: {tensors[4].GetValue():F8}";
666+
Assert.Equal(expectedLines[i * n_samples + j], output);
667+
}
668+
}
669+
}
670+
}
671+
}
559672
}
560673
}

tests/TensorFlowSharp.Tests.CSharp/TensorFlowSharp.Tests.CSharp.csproj

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@
8686
</ItemGroup>
8787
<ItemGroup>
8888
<None Include="packages.config" />
89+
<None Include="TestData\AdagradTimeDecay\optimizer_lr_test.py">
90+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
91+
</None>
92+
<None Include="TestData\Adagrad\optimizer_lr_test.py">
93+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
94+
</None>
8995
<None Include="TestData\MomentumNesterovTimeDecay\optimizer_lr_test.py">
9096
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
9197
</None>
@@ -123,6 +129,12 @@
123129
</ItemGroup>
124130
<ItemGroup />
125131
<ItemGroup>
132+
<Content Include="TestData\AdagradTimeDecay\expected.txt">
133+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
134+
</Content>
135+
<Content Include="TestData\Adagrad\expected.txt">
136+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
137+
</Content>
126138
<Content Include="TestData\MomentumNesterovTimeDecay\expected.txt">
127139
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
128140
</Content>
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
loss: 0.0474, W: 0.1000, b: 0.1000
2+
loss: 0.1411, W: 0.1061, b: 0.1023
3+
loss: 0.0540, W: 0.1143, b: 0.1060
4+
loss: 0.1528, W: 0.1197, b: 0.1082
5+
loss: 0.0145, W: 0.1270, b: 0.1117
6+
loss: 0.0250, W: 0.1293, b: 0.1128
7+
loss: 0.1141, W: 0.1311, b: 0.1142
8+
loss: 0.0779, W: 0.1378, b: 0.1170
9+
loss: 0.0528, W: 0.1410, b: 0.1193
10+
loss: 0.0182, W: 0.1442, b: 0.1212
11+
loss: 0.0836, W: 0.1447, b: 0.1223
12+
loss: 0.0892, W: 0.1482, b: 0.1245
13+
loss: 0.0149, W: 0.1529, b: 0.1268
14+
loss: 0.0702, W: 0.1539, b: 0.1277
15+
loss: 0.0579, W: 0.1569, b: 0.1297
16+
loss: 0.0525, W: 0.1588, b: 0.1315
17+
loss: 0.0130, W: 0.1616, b: 0.1331
18+
loss: 0.0313, W: 0.1621, b: 0.1340
19+
loss: 0.1071, W: 0.1629, b: 0.1352
20+
loss: 0.0322, W: 0.1647, b: 0.1375
21+
loss: 0.1104, W: 0.1660, b: 0.1387
22+
loss: 0.0043, W: 0.1688, b: 0.1410
23+
loss: 0.0155, W: 0.1693, b: 0.1414
24+
loss: 0.0717, W: 0.1700, b: 0.1422
25+
loss: 0.0562, W: 0.1730, b: 0.1440
26+
loss: 0.0329, W: 0.1747, b: 0.1455
27+
loss: 0.0141, W: 0.1763, b: 0.1467
28+
loss: 0.0606, W: 0.1766, b: 0.1475
29+
loss: 0.0568, W: 0.1786, b: 0.1491
30+
loss: 0.0085, W: 0.1813, b: 0.1506
31+
loss: 0.0496, W: 0.1819, b: 0.1511
32+
loss: 0.0444, W: 0.1837, b: 0.1525
33+
loss: 0.0338, W: 0.1850, b: 0.1538
34+
loss: 0.0094, W: 0.1867, b: 0.1549
35+
loss: 0.0253, W: 0.1871, b: 0.1555
36+
loss: 0.0930, W: 0.1876, b: 0.1565
37+
loss: 0.0234, W: 0.1890, b: 0.1583
38+
loss: 0.0908, W: 0.1898, b: 0.1593
39+
loss: 0.0012, W: 0.1918, b: 0.1610
40+
loss: 0.0110, W: 0.1921, b: 0.1612
41+
loss: 0.0514, W: 0.1925, b: 0.1618
42+
loss: 0.0445, W: 0.1947, b: 0.1632
43+
loss: 0.0227, W: 0.1959, b: 0.1644
44+
loss: 0.0116, W: 0.1970, b: 0.1652
45+
loss: 0.0476, W: 0.1972, b: 0.1659
46+
loss: 0.0392, W: 0.1987, b: 0.1671
47+
loss: 0.0051, W: 0.2007, b: 0.1682
48+
loss: 0.0374, W: 0.2010, b: 0.1686
49+
loss: 0.0360, W: 0.2024, b: 0.1697
50+
loss: 0.0230, W: 0.2034, b: 0.1708
51+
loss: 0.0072, W: 0.2047, b: 0.1716
52+
loss: 0.0213, W: 0.2049, b: 0.1721
53+
loss: 0.0834, W: 0.2054, b: 0.1729
54+
loss: 0.0179, W: 0.2065, b: 0.1745
55+
loss: 0.0776, W: 0.2072, b: 0.1752
56+
loss: 0.0001, W: 0.2088, b: 0.1767
57+
loss: 0.0081, W: 0.2089, b: 0.1768
58+
loss: 0.0384, W: 0.2092, b: 0.1772
59+
loss: 0.0365, W: 0.2109, b: 0.1783
60+
loss: 0.0162, W: 0.2119, b: 0.1793
61+
loss: 0.0099, W: 0.2128, b: 0.1800
62+
loss: 0.0387, W: 0.2130, b: 0.1805
63+
loss: 0.0278, W: 0.2141, b: 0.1816
64+
loss: 0.0030, W: 0.2157, b: 0.1824
65+
loss: 0.0291, W: 0.2159, b: 0.1827
66+
loss: 0.0299, W: 0.2171, b: 0.1836
67+
loss: 0.0159, W: 0.2179, b: 0.1845
68+
loss: 0.0056, W: 0.2189, b: 0.1852
69+
loss: 0.0184, W: 0.2191, b: 0.1856
70+
loss: 0.0761, W: 0.2194, b: 0.1863
71+
loss: 0.0140, W: 0.2204, b: 0.1877
72+
loss: 0.0679, W: 0.2210, b: 0.1883
73+
loss: 0.0000, W: 0.2224, b: 0.1896
74+
loss: 0.0061, W: 0.2224, b: 0.1896
75+
loss: 0.0293, W: 0.2227, b: 0.1900
76+
loss: 0.0306, W: 0.2240, b: 0.1908
77+
loss: 0.0117, W: 0.2249, b: 0.1917
78+
loss: 0.0086, W: 0.2256, b: 0.1923
79+
loss: 0.0321, W: 0.2257, b: 0.1927
80+
loss: 0.0200, W: 0.2267, b: 0.1936
81+
loss: 0.0018, W: 0.2279, b: 0.1943
82+
loss: 0.0230, W: 0.2281, b: 0.1946
83+
loss: 0.0254, W: 0.2291, b: 0.1953
84+
loss: 0.0111, W: 0.2298, b: 0.1961
85+
loss: 0.0044, W: 0.2306, b: 0.1966
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# This script is used to create data file (expected.txt)
2+
# which is used to compare the output from TensorFlowSharp optimizer tests.
3+
4+
import tensorflow as tf
5+
6+
# Training data
7+
train_x =[
8+
3.3, 4.4, 5.5, 6.71, 6.93, 4.168, 9.779, 6.182, 7.59, 2.167,
9+
7.042, 10.791, 5.313, 7.997, 5.654, 9.27, 3.1
10+
]
11+
train_y = [
12+
1.7, 2.76,2.09,3.19,1.694,1.573,3.366,2.596,2.53,1.221,
13+
2.827,3.465,1.65,2.904,2.42,2.94,1.3
14+
]
15+
n_samples = len(train_x)
16+
learning_rate = 0.01
17+
X = tf.placeholder(tf.float32)
18+
Y = tf.placeholder(tf.float32)
19+
20+
W = tf.Variable(tf.constant(0.1), dtype=tf.float32)
21+
b = tf.Variable(tf.constant(0.1), dtype=tf.float32)
22+
23+
pred = tf.add(tf.multiply(X,W), b)
24+
25+
cost = tf.divide(tf.reduce_sum(tf.pow(tf.subtract(pred, Y), 2.0)), tf.multiply(2.0, n_samples))
26+
optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(cost, name = "AdagradOptimizer")
27+
28+
init = tf.global_variables_initializer()
29+
with tf.Session() as session:
30+
session.run(init)
31+
for e in range(5):
32+
for i in range(n_samples):
33+
_, cost_v, W_v, b_v, pred_v = session.run([optimizer, cost, W, b, pred], feed_dict = {X: train_x[i], Y: train_y[i]})
34+
print(f"loss: {cost_v:.4f}, W: {W_v:.4f}, b: {b_v:.4f}")
35+
#print("Prediction: %f == Actual: %f" % (pred_v, train_y[i]))

0 commit comments

Comments
 (0)