
Fix mse gradient
ragmani committed Sep 6, 2024
1 parent 93b5d13 commit cf23704
Showing 12 changed files with 149 additions and 32 deletions.
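In short, this commit makes the divisor of the MSE gradient depend on the loss reduction type instead of always dividing by the flattened tensor size. As implemented in MSEGrad below, the gradient is

\[
\frac{\partial\,\mathrm{MSE}}{\partial \hat{y}_i} = \frac{-2\,(y_i - \hat{y}_i)}{N},
\qquad
N =
\begin{cases}
\text{batch\_size} \times \text{flat\_size} & \text{for SUM\_OVER\_BATCH\_SIZE,} \\
\text{flat\_size} & \text{for SUM,}
\end{cases}
\]

where \(y_i\) is y_true and \(\hat{y}_i\) is y_pred.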
37 changes: 37 additions & 0 deletions compute/cker/include/cker/train/Types.h
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef __NNFW_CKER_TRAIN_TYPES_H__
#define __NNFW_CKER_TRAIN_TYPES_H__

namespace nnfw
{
namespace cker
{
namespace train
{

enum class LossReductionType
{
SUM_OVER_BATCH_SIZE,
SUM,
};

} // namespace train
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_TRAIN_TYPES_H__
31 changes: 27 additions & 4 deletions compute/cker/include/cker/train/operation/Loss.h
@@ -25,6 +25,7 @@
#include "cker/eigen/Utils.h"
#include "cker/eigen/xent_op.h"
#include "cker/operation/Helper/BCast.h"
#include "cker/train/Types.h"

namespace nnfw
{
@@ -63,17 +64,39 @@ inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_

template <typename T>
inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape,
const T *y_true_data, const Shape &grad_shape, T *grad_data)
const T *y_true_data, const Shape &grad_shape, T *grad_data,
LossReductionType reduction_type)
{
if (y_pred_shape != y_true_shape)
throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape");
if (y_pred_shape != grad_shape)
throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape");

const int size = grad_shape.FlatSize();
for (int i = 0; i < size; ++i)
// TODO Optimize
const int batch_size = grad_shape.Dims(0);
const auto flat_size = FlatSizeSkipDim(grad_shape, 0);
auto reduction_size = 1;
switch (reduction_type)
{
grad_data[i] = static_cast<T>(-2 * (y_true_data[i] - y_pred_data[i]) / size);
case LossReductionType::SUM_OVER_BATCH_SIZE:
reduction_size = batch_size * flat_size;
break;
case LossReductionType::SUM:
reduction_size = flat_size;
break;
default:
throw std::runtime_error("Unsupported reduction type");
}

for (int b = 0; b < batch_size; ++b)
{
for (int i = 0; i < flat_size; ++i)
{
const int offset = b * flat_size + i;
assert(offset >= 0);
grad_data[offset] =
static_cast<T>(-2 * (y_true_data[offset] - y_pred_data[offset]) / reduction_size);
}
}
}
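As a concrete check of the numbers used in the {2, 3} test cases in Loss.test.cc, the following standalone sketch (hypothetical, not part of this commit) recomputes the expected gradients for both reduction types:

// Minimal standalone sketch: recomputes the {2, 3} expected gradients from
// Loss.test.cc for both reduction types (input values copied from the tests).
#include <cstddef>
#include <cstdio>
#include <initializer_list>
#include <vector>

int main()
{
  const std::vector<float> y_pred = {27.2f, 31.8f, 51.9f, 10.2f, 34.2f, 12.4f};
  const std::vector<float> y_true = {31.3f, 40.3f, 29.7f, 12.9f, 25.8f, 11.9f};
  const int batch_size = 2;
  const int flat_size = 3;

  // SUM_OVER_BATCH_SIZE divides by batch_size * flat_size (= 6);
  // SUM divides by flat_size (= 3), so its gradients are exactly twice as large.
  for (const int reduction_size : {batch_size * flat_size, flat_size})
  {
    for (std::size_t i = 0; i < y_pred.size(); ++i)
      std::printf("%.7f ", -2.0f * (y_true[i] - y_pred[i]) / reduction_size);
    std::printf("\n");
  }
  return 0;
}

The first row should reproduce the SUM_OVER_BATCH_SIZE expected vector and the second row the SUM expected vector from the tests, up to float rounding.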

43 changes: 32 additions & 11 deletions compute/cker/src/train/Loss.test.cc
@@ -264,7 +264,8 @@ TEST(CKer_Operation, LossMSEGrad)
std::vector<int> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

for (size_t i = 0; i < deriv_y_pred.size(); ++i)
EXPECT_EQ(deriv_y_pred[i], expected[i]);
@@ -278,21 +279,38 @@ TEST(CKer_Operation, LossMSEGrad)
std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};

nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

for (size_t i = 0; i < deriv_y_pred.size(); ++i)
EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
}

{
// Shape: {2, 3} -> m_rows:3, m_cols:2
// Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM_OVER_BATCH_SIZE
std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
std::vector<float> deriv_y_pred(6);
std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667};

nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

for (size_t i = 0; i < deriv_y_pred.size(); ++i)
EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
}

{
// Shape: {2, 3} -> m_rows:3, m_cols:2, LossReductionType::SUM
std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
std::vector<float> deriv_y_pred(6);
std::vector<float> expected = {-2.7333324, -5.6666665, 14.8, -1.7999998, 5.6, 0.33333334};

nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM);

for (size_t i = 0; i < deriv_y_pred.size(); ++i)
EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
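
For reference, this SUM case uses the same inputs as the SUM_OVER_BATCH_SIZE case above; only the divisor changes from batch_size * flat_size = 6 to flat_size = 3, so every expected gradient is exactly twice as large, e.g. -2 * (31.3 - 27.2) / 3 ≈ -2.733333 for the first element.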
@@ -309,7 +327,8 @@ TEST(CKer_Operation, neg_LossMSEGrad)
std::vector<float> expected = {1., 1., 1., 1., 1., 1.};

nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE);

for (size_t i = 0; i < deriv_y_pred.size(); ++i)
EXPECT_NE(deriv_y_pred[i], expected[i]);
@@ -321,9 +340,10 @@
std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
std::vector<float> deriv_y_pred(10);

EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
nnfw::cker::Shape{2, 3}, y_true.data(),
nnfw::cker::Shape{1, 10}, deriv_y_pred.data()));
EXPECT_ANY_THROW(
nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{2, 3},
y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
}

{
@@ -332,9 +352,10 @@ TEST(CKer_Operation, neg_LossMSEGrad)
std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
std::vector<float> deriv_y_pred(6);

EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
nnfw::cker::Shape{1, 10}, y_true.data(),
nnfw::cker::Shape{2, 3}, deriv_y_pred.data()));
EXPECT_ANY_THROW(
nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data(),
nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE));
}
}

7 changes: 5 additions & 2 deletions runtime/onert/backend/train/KernelGenerator.cc
@@ -405,13 +405,15 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)

auto loss_code = node.param().loss_code;
auto loss_param = node.param().loss_param;
const auto reduction_type = node.param().reduction_type;

switch (loss_code)
{
case ir::train::LossCode::MeanSquaredError:
{
auto fn = std::make_unique<ops::LossMeanSquaredErrorLayer>();
fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor);
fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
reduction_type);
_return_fn = std::move(fn);
break;
}
@@ -421,7 +423,8 @@ void KernelGenerator::visit(const ir::train::operation::Loss &node)
bool is_required_normalization = (last_node != ir::OpCode::Softmax);
auto fn = std::make_unique<ops::LossCategoricalCrossentropyLayer>();
fn->configure(y_pred_tensor, y_true_tensor, output_tensor, back_prop_y_pred_tensor,
loss_param.cce.axis, loss_param.cce.label_smoothing, is_required_normalization);
reduction_type, loss_param.cce.axis, loss_param.cce.label_smoothing,
is_required_normalization);
_return_fn = std::move(fn);
break;
}
(file header not shown; presumably runtime/onert/backend/train/ops/LossCategoricalCrossentropyLayer.cc)
@@ -28,14 +28,12 @@ namespace train
namespace ops
{

void LossCategoricalCrossentropyLayer::configure(const IPortableTensor *y_pred,
const IPortableTensor *y_true,
IPortableTensor *output,
IPortableTensor *back_prop_y_pred, int32_t axis,
float label_smoothing,
bool is_required_normalization)
void LossCategoricalCrossentropyLayer::configure(
const IPortableTensor *y_pred, const IPortableTensor *y_true, IPortableTensor *output,
IPortableTensor *back_prop_y_pred, ir::train::LossReductionType reduction_type, int32_t axis,
float label_smoothing, bool is_required_normalization)
{
LossLayer::configure(y_pred, y_true, output, back_prop_y_pred);
LossLayer::configure(y_pred, y_true, output, back_prop_y_pred, reduction_type);

_axis = axis;
_label_smoothing = label_smoothing;
(file header not shown; presumably runtime/onert/backend/train/ops/LossCategoricalCrossentropyLayer.h)
@@ -19,6 +19,7 @@

#include "LossLayer.h"
#include "../Tensor.h"
#include <ir/train/LossInfo.h>

namespace onert
{
@@ -35,8 +36,9 @@ class LossCategoricalCrossentropyLayer : public LossLayer
LossCategoricalCrossentropyLayer() = default;

void configure(const IPortableTensor *y_pred, const IPortableTensor *y_true,
IPortableTensor *output, IPortableTensor *back_prop_y_pred, int32_t axis,
float label_smoothing, bool is_required_normalization);
IPortableTensor *output, IPortableTensor *back_prop_y_pred,
ir::train::LossReductionType reduction_type, int32_t axis, float label_smoothing,
bool is_required_normalization);
void forward(bool training) override;
void backward() override;

4 changes: 3 additions & 1 deletion runtime/onert/backend/train/ops/LossLayer.cc
@@ -32,7 +32,8 @@ LossLayer::LossLayer()
}

void LossLayer::configure(const IPortableTensor *y_pred, const IPortableTensor *y_true,
IPortableTensor *output, IPortableTensor *back_prop_y_pred)
IPortableTensor *output, IPortableTensor *back_prop_y_pred,
ir::train::LossReductionType reduction_type)
{
assert(y_pred != nullptr);
assert(y_true != nullptr);
@@ -43,6 +44,7 @@ void LossLayer::configure(const IPortableTensor *y_pred, const IPortableTensor *
_y_true = y_true;
_output = output;
_back_prop_y_pred = back_prop_y_pred;
_reduction_type = reduction_type;
}

} // namespace ops
5 changes: 4 additions & 1 deletion runtime/onert/backend/train/ops/LossLayer.h
@@ -21,6 +21,7 @@
#include <ops/ElementwiseActivationLayer.h>

#include <exec/train/ITrainableFunction.h>
#include <ir/train/LossInfo.h>

namespace onert
{
@@ -42,13 +43,15 @@ class LossLayer : public ::onert::exec::train::ITrainableFunction
LossLayer();

void configure(const IPortableTensor *y_pred, const IPortableTensor *y_true,
IPortableTensor *output, IPortableTensor *back_prop_y_pred);
IPortableTensor *output, IPortableTensor *back_prop_y_pred,
ir::train::LossReductionType reduction_type);

protected:
const IPortableTensor *_y_pred;
const IPortableTensor *_y_true;
IPortableTensor *_output;
IPortableTensor *_back_prop_y_pred;
ir::train::LossReductionType _reduction_type;
};

} // namespace ops
8 changes: 5 additions & 3 deletions runtime/onert/backend/train/ops/LossMeanSquaredErrorLayer.cc
@@ -30,9 +30,10 @@ namespace ops

void LossMeanSquaredErrorLayer::configure(const IPortableTensor *y_pred,
const IPortableTensor *y_true, IPortableTensor *output,
IPortableTensor *back_prop_y_pred)
IPortableTensor *back_prop_y_pred,
ir::train::LossReductionType reduction_type)
{
LossLayer::configure(y_pred, y_true, output, back_prop_y_pred);
LossLayer::configure(y_pred, y_true, output, back_prop_y_pred, reduction_type);
}

void LossMeanSquaredErrorLayer::forward(bool)
@@ -53,11 +54,12 @@ void LossMeanSquaredErrorLayer::backward()
{
assert(_back_prop_y_pred != nullptr);

const auto reduction_type = convertLossReductionType(_reduction_type);
if (_y_pred->data_type() == OperandType::FLOAT32)
{
nnfw::cker::train::MSEGrad(getShape(_y_pred), getBuffer<float>(_y_pred), getShape(_y_true),
getBuffer<float>(_y_true), getShape(_back_prop_y_pred),
getBuffer<float>(_back_prop_y_pred));
getBuffer<float>(_back_prop_y_pred), reduction_type);
}
else
{
3 changes: 2 additions & 1 deletion runtime/onert/backend/train/ops/LossMeanSquaredErrorLayer.h
@@ -34,7 +34,8 @@ class LossMeanSquaredErrorLayer : public LossLayer
LossMeanSquaredErrorLayer() = default;

void configure(const IPortableTensor *y_pred, const IPortableTensor *y_true,
IPortableTensor *output, IPortableTensor *back_prop_y_pred);
IPortableTensor *output, IPortableTensor *back_prop_y_pred,
ir::train::LossReductionType reduction_type);
void forward(bool training) override;
void backward() override;
};
15 changes: 15 additions & 0 deletions runtime/onert/backend/train/ops/OperationUtils.cc
@@ -97,6 +97,21 @@ void biasGrad(const IPortableTensor *input_backprop, IPortableTensor *bias_grad)
bias_grad_buffer, bias_grad_shape);
}

nnfw::cker::train::LossReductionType convertLossReductionType(ir::train::LossReductionType type)
{
switch (type)
{
case ir::train::LossReductionType::SumOverBatchSize:
return nnfw::cker::train::LossReductionType::SUM_OVER_BATCH_SIZE;
break;
case ir::train::LossReductionType::Sum:
return nnfw::cker::train::LossReductionType::SUM;
break;
default:
throw std::runtime_error("Unsupported LossReductionType");
}
}

} // namespace ops
} // namespace train
} // namespace backend
10 changes: 10 additions & 0 deletions runtime/onert/backend/train/ops/OperationUtils.h
@@ -17,6 +17,8 @@
#ifndef __ONERT_BACKEND_TRAIN_OPS_OPERATION_UTILS_H__
#define __ONERT_BACKEND_TRAIN_OPS_OPERATION_UTILS_H__

#include <cker/train/Types.h>
#include <ir/train/LossInfo.h>
#include <ops/OperationUtils.h>

namespace onert
@@ -77,6 +79,14 @@ const IPortableTensor *backpropActivation(const ir::Activation &activation,
*/
void biasGrad(const IPortableTensor *input_backprop, IPortableTensor *bias_grad);

/**
* @brief convert loss reduction type
*
* @param type loss reduction type defined in ir::train::LossReductionType
* @return corresponding type defined in cker::train::LossReductionType
*/
nnfw::cker::train::LossReductionType convertLossReductionType(ir::train::LossReductionType type);

} // namespace ops
} // namespace train
} // namespace backend
