Unverified commit 5c7bab46, authored by Yinggang Wang, committed by GitHub

feat(SGD): support weight_decay (actually L2) (#5587)


Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
parent 5341044e
@@ -30,7 +30,8 @@ from .optimizer import Optimizer, ParamGroup
 class SGD(Optimizer):
     r"""Implements SGD algorithm.
-    This algorithm takes a random sample’s gradient as an approximate estimate of the overall gradient in small batch gradient descent.
+    This algorithm takes a random sample’s gradient as an approximate estimate of
+    the overall gradient in small batch gradient descent.

     When the momentum = 0, the equation of parameters updating is:

@@ -42,15 +43,16 @@ class SGD(Optimizer):
     .. math::

-        & V_t = \beta * V_{t-1} + learning\_rate * g_t
-        & param_{new} = param_{old} - V_t
+        & V_t = \beta * V_{t-1} - learning\_rate * (g_t * scale + param_{old} * weight\_decay)
+        & param_{new} = param_{old} + V_t

     Args:
         params (iterable): iterable of parameters to optimize or dicts defining
             parameter groups
         lr (float, optional): learning rate (default: 1e-3)
         momentum (float, optional): Momentum factor (default: 0.0)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0)
         scale (float, optional): the scale factor of loss (default: 1.0)
     """
@@ -60,16 +62,19 @@ class SGD(Optimizer):
         parameters: Union[Iterator[Parameter], List[Dict]],
         lr: float = 1e-3,
         momentum: float = 0.0,
+        weight_decay: float = 0.0,  # SGD's weight_decay is actually an L2 penalty
         scale: float = 1.0,
     ):
         super().__init__()
         assert lr >= 0.0, f"Invalid learning rate: {lr}"
         assert momentum >= 0.0, f"Invalid momentum: {momentum}"
         assert scale >= 0.0, f"Invalid scale factor: {scale}"
+        assert weight_decay >= 0.0, f"Invalid weight_decay: {weight_decay}"
         self._default_options["lr"] = lr
         self._default_options["scale"] = scale
         self._default_options["momentum"] = momentum
+        self._default_options["weight_decay"] = weight_decay

         # Add parameters
         if isinstance(parameters, collections.abc.Iterator):
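With the new default option registered, a parameter group can carry its own weight_decay. A construction sketch mirroring the updated test further down (the tensor and values are placeholders, and eager mode is assumed to be enabled as in the test suite):

import numpy as np
import oneflow.experimental as flow
from oneflow.python.nn.parameter import Parameter

x = Parameter(flow.Tensor(np.ones((10,))))  # placeholder parameter
sgd = flow.optim.SGD(
    [{"params": [x], "lr": 0.1, "momentum": 0.9, "scale": 1.0, "weight_decay": 0.9}]
)
# each call to sgd.step() then treats grad * scale + weight_decay * x as the effective gradient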
@@ -93,7 +98,6 @@ class SGD(Optimizer):
             .Input("model_diff")
             .Input("momentum")
             .Attr("l1", 0.0)
-            .Attr("l2", 0.0)
             .Attr("weight_decay", 0.0)
             .Build()
         )
@@ -103,7 +107,6 @@ class SGD(Optimizer):
             .Input("model_diff")
             .Attr("weight_decay", 0.0)
             .Attr("l1", 0.0)
-            .Attr("l2", 0.0)
             .Build()
         )
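Both update ops drop the standalone "l2" attribute; the L2 coefficient now travels through the "weight_decay" attribute that step() fills in below. A hedged sketch of the assumed kernel semantics for the momentum-free path, in plain Python rather than the op-builder API (drawn from the NumPy reference in the test, not from the kernel source):

def sgd_update(model, model_diff, learning_rate, scale=1.0, weight_decay=0.0):
    # assumed semantics: weight_decay acts as an L2 coefficient on the model
    effective_diff = model_diff * scale + weight_decay * model
    return model - learning_rate * effective_diff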
@@ -115,21 +118,24 @@ class SGD(Optimizer):
         for param_group in self.param_groups:
             lr = param_group["lr"]
+            scale = param_group["scale"]
+            l2 = param_group["weight_decay"]
             for param in param_group.parameters:
                 if param.grad is None:
                     continue
                 if param_group["momentum"] == 0.0:
-                    scale = param_group["scale"]
-                    self._sgd(param, param.grad, learning_rate_val=lr, scale=scale)
+                    self._sgd(
+                        param, param.grad, learning_rate_val=lr, l2=l2, scale=scale
+                    )
                 else:
                     momentum_buf = self._state[param]["momentum_buf"]
-                    scale = param_group["scale"]
                     beta = param_group["momentum"]
                     self._momentum_sgd(
                         param,
                         param.grad,
                         momentum_buf,
                         learning_rate_val=lr,
+                        l2=l2,
                         scale=scale,
                         beta=beta,
                     )
...
@@ -19,12 +19,19 @@ from collections import OrderedDict
 import numpy as np
 import oneflow.experimental as flow
-from test_util import GenArgList
+from test_util import GenArgDict
 from oneflow.python.nn.parameter import Parameter


 def compare_with_numpy_sgd(
-    test_case, device, x_shape, scale, momentum, learning_rate, train_iters,
+    test_case,
+    device,
+    x_shape,
+    scale,
+    momentum,
+    weight_decay,
+    learning_rate,
+    train_iters,
 ):
     # generate random number sequences
     random_grad_seq = []
@@ -36,7 +43,15 @@ def compare_with_numpy_sgd(
     def train_by_oneflow():
         x = Parameter(flow.Tensor(init_value, device=flow.device(device)))
         sgd = flow.optim.SGD(
-            [{"params": [x], "lr": learning_rate, "momentum": momentum, "scale": scale}]
+            [
+                {
+                    "params": [x],
+                    "lr": learning_rate,
+                    "momentum": momentum,
+                    "scale": scale,
+                    "weight_decay": weight_decay,
+                }
+            ]
         )

         def train_one_iter(grad):
@@ -57,8 +72,9 @@ def compare_with_numpy_sgd(
         vt = np.zeros_like(x)

         def train_one_iter(grad):
-            v = momentum * vt + learning_rate * scale * grad
-            param = x - v
+            grad = grad * scale + weight_decay * x
+            v = momentum * vt - learning_rate * grad
+            param = x + v
             return param, v

         for i in range(train_iters):
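As a concrete check of this reference, with scale = 1.0, weight_decay = 0.9, learning_rate = 0.1, x = 1.0, vt = 0, and grad = 0.5, one iteration gives grad = 0.5 + 0.9 * 1.0 = 1.4, v = 0 - 0.1 * 1.4 = -0.14, and param = 1.0 + (-0.14) = 0.86, which is the value the OneFlow result is compared against.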
@@ -80,10 +96,11 @@ class TestOptimizers(flow.unittest.TestCase):
         arg_dict["x_shape"] = [(10,)]
         arg_dict["scale"] = [1.0, 0.9]
         arg_dict["momentum"] = [0.0, 0.9]
-        arg_dict["learning_rate"] = [1]
+        arg_dict["weight_decay"] = [0.0, 0.9]
+        arg_dict["learning_rate"] = [1, 0.1]
         arg_dict["train_iters"] = [10]
-        for arg in GenArgList(arg_dict):
-            compare_with_numpy_sgd(test_case, *arg)
+        for arg in GenArgDict(arg_dict):
+            compare_with_numpy_sgd(test_case, **arg)

 if __name__ == "__main__":
...
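The test driver switches from GenArgList to GenArgDict so the new weight_decay field can be passed by keyword. Assuming GenArgDict expands the argument dictionary into the cartesian product of its value lists and yields one kwargs dict per combination (an assumption about the test_util helper, which this commit does not show), an equivalent sketch is:

import itertools

def gen_arg_dict(arg_dict):
    # assumed behavior: one dict per combination of the value lists,
    # suitable for splatting as **kwargs into compare_with_numpy_sgd
    keys = list(arg_dict.keys())
    for values in itertools.product(*(arg_dict[k] for k in keys)):
        yield dict(zip(keys, values))

# e.g. momentum in [0.0, 0.9] and weight_decay in [0.0, 0.9] yield four combinations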