From a984ea4163ed98c0b557a72cf641fe6901fab1cf Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Dec 2021 13:57:28 +0000
Subject: [PATCH 01/23] add reduce operation

---
 include/mxnet/c_api.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 0aff74772c47..551643afa6a5 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -502,6 +502,15 @@ MXNET_DLL int MXSetNumOMPThreads(int thread_num);
  * \param bulk_size new bulk_size
  * \param prev_bulk_size previous bulk_size
  */
 MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size);
+
+/*!
+ * \brief Get the compute capability of a given GPU.
+ * \param dev the GPU number to query
+ * \param out pointer to integer that will hold the compute capability of the queried GPU.
+ * \return 0 when success, -1 when failure happens.
+ */
+MXNET_DLL int MXGetGPUSMArch(int dev, int* out);
 
 /*!
@@ -521,6 +530,22 @@ MXNET_DLL int MXGetGPUCount(int* out);
  */
 MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem);
 
+/*!
+ * \brief Get the size of the NCCL unique id (in bytes).
+ * \param size pointer to integer that will hold the NCCL unique id size.
+ * \return 0 when success, -1 when failure happens.
+ */
+MXNET_DLL int MXNCCLGetUniqueIdSize(int* size);
+
+/*!
+ * \brief Get the NCCL unique id.
+ * \param out pointer to an array that will hold the NCCL unique id. It has to be at least of the
+ *            size returned by MXNCCLGetUniqueIdSize.
+ * \return 0 when success, -1 when failure happens.
+ */
+MXNET_DLL int MXNCCLGetUniqueId(void* out);
+
 /*!
  * \brief get the free and total available memory on a GPU
  * \param dev the GPU number to query

From d31c7b2b999ea7c136c0980a33f4f435864f0485 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Dec 2021 13:57:42 +0000
Subject: [PATCH 02/23] add reduce operation

---
 src/c_api/c_api.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 8bb2b54bcc8d..07a0ef3353b6 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -61,6 +61,13 @@
 #include "miniz.h"
 #include "nnvm/pass_functions.h"
 
+#if MXNET_USE_CUDA
+#include "../common/cuda/utils.h"
+#endif
+#if MXNET_USE_NCCL
+#include <nccl.h>
+#endif
+
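+// The intended handshake for the NCCL helpers declared in include/mxnet/c_api.h:
+// the frontend queries the id size with MXNCCLGetUniqueIdSize(), allocates a
+// buffer of that many bytes, fills it with MXNCCLGetUniqueId(), and broadcasts
+// the buffer to all workers (e.g. over MPI) so that every rank seeds its NCCL
+// communicator with the same unique id.
+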
 // FTZ only applies to SSE and AVX instructions.
 #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)

From da37fbc34ec3d7c134bdb5a3edf72a3180164dc7 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Dec 2021 13:58:37 +0000
Subject: [PATCH 03/23] add reduce operation

---
 src/operator/contrib/nn/reduce_op-inl.h | 97 +++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 src/operator/contrib/nn/reduce_op-inl.h

diff --git a/src/operator/contrib/nn/reduce_op-inl.h b/src/operator/contrib/nn/reduce_op-inl.h
new file mode 100644
index 000000000000..771a4fa6ecce
--- /dev/null
+++ b/src/operator/contrib/nn/reduce_op-inl.h
@@ -0,0 +1,97 @@
+#ifndef MXNET_OPERATOR_CONTRIB_NCCLREDUCE_H_
+#define MXNET_OPERATOR_CONTRIB_NCCLREDUCE_H_
+
+#include <dmlc/parameter.h>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../elemwise_op_common.h"
+#include "../../tensor/init_op.h"
+
+#if MXNET_USE_NCCL
+#include <nccl.h>
+#include <unordered_map>
+#include <memory>
+
+namespace mxnet {
+namespace op {
+
+struct NCCLReduceParam : public dmlc::Parameter<NCCLReduceParam> {
+  int32_t num_gpus;
+  int32_t root_rank;
+  int32_t rank;
+  uintptr_t nccl_unique_id;
+
+  DMLC_DECLARE_PARAMETER(NCCLReduceParam) {
+    DMLC_DECLARE_FIELD(num_gpus)
+      .set_default(1)
+      .describe("Number of all gpus.");
+    DMLC_DECLARE_FIELD(root_rank)
+      .set_default(0)
+      .describe("root rank of reduce operation");
+    DMLC_DECLARE_FIELD(rank)
+      .set_default(0)
+      .describe("rank of current process");
+    DMLC_DECLARE_FIELD(nccl_unique_id)
+      .describe("NCCL unique ID");
+  }
+};
+
+template <int req>
+struct ncclreduce_backward {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int i,
+                                  DType* in_grad,
+                                  const DType* out_grad) {
+    KERNEL_ASSIGN(in_grad[i], req, out_grad[i]);
+  }
+};
+
+template <typename xpu>
+void NCCLReduceBackward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
+  const TBlob& out_grad = inputs[0];
+  const TBlob& in_grad = outputs[0];
+  const NCCLReduceParam& param = nnvm::get<NCCLReduceParam>(attrs.parsed);
+  using namespace mxnet_op;
+  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
+    MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
+      Kernel<ncclreduce_backward<req_type>, xpu>::Launch(s,
+                                                         in_grad.Size(),
+                                                         in_grad.dptr<DType>(),
+                                                         out_grad.dptr<DType>());
+    });
+  });
+}
+
+class NCCLCommContainer {
+ public:
+  struct Param {
+    int num_gpus;
+    int rank;
+    uintptr_t nccl_unique_id;
+  };
+  static inline std::unordered_map<int, std::unique_ptr<ncclComm_t>> comm_map;
+
+  static void Init(const Param& param);
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+#else
+static_assert(false, "You need to compile with NCCL support to use reduce operation!");
+#endif  // MXNET_USE_NCCL
+
+#endif  // MXNET_OPERATOR_CONTRIB_NCCLREDUCE_H_
\ No newline at end of file

From 2a18ca9245e9dbbd8ae4e83e1b73aefbed9a36bf Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Dec 2021 13:58:56 +0000
Subject: [PATCH 04/23] add reduce operation

---
 src/operator/contrib/nn/reduce_op.cc | 89 ++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 src/operator/contrib/nn/reduce_op.cc

diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc
new file mode 100644
index 000000000000..b21c290a505e
--- /dev/null
+++ b/src/operator/contrib/nn/reduce_op.cc
@@ -0,0 +1,89 @@
+#include "reduce_op-inl.h"
+#include <mutex>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <memory>
+#include <utility>
+#include "../../operator_common.h"
+#include "../../elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+
+void NCCLCommContainer::Init(const NCCLCommContainer::Param& param) {
+  std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
+  if (NCCLCommContainer::comm_map.count(param.num_gpus) == 0) {
+    auto [it, inserted] = NCCLCommContainer::comm_map.emplace(param.num_gpus,  // NOLINT(*)
+                                                              std::make_unique<ncclComm_t>());
+    CHECK(inserted) << "Could not insert new NCCL communicator!";
+    ncclComm_t* comm = it->second.get();
+    ncclUniqueId id = *(reinterpret_cast<ncclUniqueId*>(
+                          reinterpret_cast<void*>(param.nccl_unique_id)));
+    auto result = ncclCommInitRank(comm, param.num_gpus, id, param.rank);
+    CHECK_EQ(result, ncclSuccess) << "ncclCommInitRank failed!";
+  }
+}
+
+bool NCCLReduceShape(const nnvm::NodeAttrs& attrs,
+                     mxnet::ShapeVector* in_attrs,
+                     mxnet::ShapeVector* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U;
+}
+
+inline bool NCCLReduceType(const nnvm::NodeAttrs& attrs,
+                           std::vector<int>* in_attrs,
+                           std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  return out_attrs->at(0) != -1;
+}
+
+DMLC_REGISTER_PARAMETER(NCCLReduceParam);
+
+NNVM_REGISTER_OP(_contrib_NCCLReduce)
+.describe(R"code(Reduce operation
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser<NCCLReduceParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data"};
+  })
+.set_attr<mxnet::FInferShape>("FInferShape", NCCLReduceShape)
+.set_attr<nnvm::FInferType>("FInferType", NCCLReduceType)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_NCCLReduce"})
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int>>{{0, 0}};
+  })
+.set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
+  [](const NodeAttrs& attrs) {
+    const NCCLReduceParam& param = nnvm::get<NCCLReduceParam>(attrs.parsed);
+    if (param.num_gpus == 1) {
+      return std::vector<bool>{true};
+    } else {
+      return std::vector<bool>{false};
+    }
+  })
+.add_argument("data", "NDArray-or-Symbol", "Input ndarray")
+.add_arguments(NCCLReduceParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_NCCLReduce)
+.set_attr_parser(ParamParser<NCCLReduceParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", NCCLReduceBackward<cpu>);
+
+}  // namespace op
+}  // namespace mxnet
\ No newline at end of file
From 11f6e6cb9fefeb89160c30117fa6400f4129b6f0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 3 Dec 2021 13:59:12 +0000
Subject: [PATCH 05/23] add reduce operation

---
 src/operator/contrib/nn/reduce_op.cu | 60 ++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 src/operator/contrib/nn/reduce_op.cu

diff --git a/src/operator/contrib/nn/reduce_op.cu b/src/operator/contrib/nn/reduce_op.cu
new file mode 100644
index 000000000000..6c0a1a92f2fa
--- /dev/null
+++ b/src/operator/contrib/nn/reduce_op.cu
@@ -0,0 +1,60 @@
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file reduce_op.cu
+ * \brief NCCL reduce operator
+ * \author Przemyslaw Tredak
+ */
+
+#include "reduce_op-inl.h"
+#include <nccl.h>
+#include <mutex>
+#include <string>
+#include <vector>
+#include "../../operator_common.h"
+#include "../../../common/utils.h"
+#include "../../tensor/elemwise_binary_op.h"
+
+namespace mxnet {
+namespace op {
+
+void NCCLReduceCompute(const nnvm::NodeAttrs& attrs,
+                       const OpContext& ctx,
+                       const std::vector<TBlob>& inputs,
+                       const std::vector<OpReqType>& req,
+                       const std::vector<TBlob>& outputs) {
+  const NCCLReduceParam& param = nnvm::get<NCCLReduceParam>(attrs.parsed);
+  if (req[0] == OpReqType::kNullOp) return;
+  if (param.num_gpus == 1 && req[0] == OpReqType::kWriteInplace) return;
+  NCCLCommContainer::Param p = {param.num_gpus,
+                                param.rank,
+                                param.nccl_unique_id};
+  NCCLCommContainer::Init(p);
+
+  std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
+  ncclComm_t comm = *(NCCLCommContainer::comm_map.at(param.num_gpus));
+  // The payload is reduced as raw bytes, so the byte count below is
+  // consistent with the ncclInt8 element type passed to ncclReduce.
+  const index_t size = inputs[0].shape_.Size() *
+                       common::mshadow_type_info(inputs[0].type_flag_).size;
+  if (req[0] != OpReqType::kAddTo) {
+    ncclResult_t result = ncclReduce(inputs[0].dptr_,
+                                     outputs[0].dptr_,
+                                     size, ncclInt8, ncclSum, param.root_rank,
+                                     comm,
+                                     mshadow::Stream<gpu>::GetStream(ctx.get_stream<gpu>()));
+    CHECK_EQ(result, ncclSuccess) << "NCCL Reduce failed!";
+  } else {
+    LOG(FATAL) << "kAddTo not supported yet!";
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_NCCLReduce)
+.set_attr<FCompute>("FCompute<gpu>", NCCLReduceCompute);
+
+NNVM_REGISTER_OP(_backward_NCCLReduce)
+.set_attr<FCompute>("FCompute<gpu>", NCCLReduceBackward<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
\ No newline at end of file

From 357b73a627f1d39967c15fe50186f0615e64235f Mon Sep 17 00:00:00 2001
From: "xinyual@amazon.com"
Date: Tue, 14 Dec 2021 10:13:46 +0000
Subject: [PATCH 06/23] pass way

---
 example/extensions/lib_pass/pass_lib.cc  | 54 ++++++++++++++++++++++++
 example/extensions/lib_pass/test_pass.py | 37 ++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc
index fb9a2d42f8d3..f7101a2cb3c8 100644
--- a/example/extensions/lib_pass/pass_lib.cc
+++ b/example/extensions/lib_pass/pass_lib.cc
@@ -26,6 +26,7 @@
 #include <math.h>
 #include <iostream>
 #include <algorithm>
+#include <string>
 #include "mxnet/lib_api.h"
 
 using namespace mxnet::ext;
@@ -36,7 +37,60 @@ MXReturnValue myPass(mxnet::ext::Graph *g,
   for (auto kv : options) {
     std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;
   }
+  // g->addNode("myConv", "Convolution");
+  mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2);
+  mxnet::ext::Node *old_layer = g->getNode(g->size()-1);
+  std::cout << old_layer->name << std::endl;
+  mxnet::ext::Node *new_weight = g->addNode("myweight3", "null");
+  new_weight->alloc_arg({3,2}, MXContext::CPU(0), kFloat32);
+
+  mxnet::ext::Node *new_bias = g->addNode("mybias3", "null");
+  new_bias->alloc_arg({3,}, MXContext::CPU(0), kFloat32);
+
+  // new_bias->alloc_aux({2}, MXContext::CPU(0), kInt32);
+
+  auto attrs = copy_old_layer->attrs;
+  mxnet::ext::Node *new_layer = g->addNode("mylinear", "FullyConnected");
+  auto new_attrs = &new_layer->attrs;
+  for (auto it = attrs.begin(); it != attrs.end(); it++) {
+    std::cout << it->first << " : " << it->second << std::endl;
+    if (it->first != "num_hidden") {
+      new_attrs->insert({{it->first, it->second}});
+    }
+  }
+  new_attrs->insert({{"num_hidden", "3"}});
+
+  std::cout << "current:" << std::endl;
+  for (auto it = new_attrs->begin(); it != new_attrs->end(); it++) {
+    std::cout << it->first << " : " << it->second << std::endl;
+  }
+  new_layer->inputs.push_back({old_layer, 0});
+  new_layer->inputs.push_back({new_weight, 0});
+  new_layer->inputs.push_back({new_bias, 0});
+ 
old_layer->outputs.push_back({new_layer, 0}); + new_weight->outputs.push_back({new_layer, 1}); + new_bias->outputs.push_back({new_layer, 2}); + auto it = g->outputs.begin(); + g->outputs.erase(it); + mxnet::ext::NodeEntry new_entry; + new_entry.node = new_layer; + new_entry.entry = 0; + g->outputs.push_back(new_entry); + g->print(); + + return MX_SUCCESS; } diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index ab89f9566ebe..7e65eb60c75c 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,6 +27,8 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd +from mxnet import np, npx +from gluonnlp.layers import get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -47,7 +49,42 @@ d = mx.sym.exp(c) sym = mx.sym.log(d) + +class Easynet(nn.HybridBlock): + def __init__(self): + super().__init__() + self.l1 = nn.Dense(in_units=2, units=2, flatten=False) + #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) + self.act1 = get_activation('relu') + + #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) + #self.seq.add(get_activation('relu')) + #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) + #self.seq.register_op_hook(mon_callback, monitor_all=True) + #self.l1.register_op_hook(mon_callback, monitor_all=True) + + + def forward(self, input): + input = self.l1(input) + input = self.act1(input) + return input + + def test_model(pass_name): + model = Easynet() + model.initialize() + model.hybridize() + + + print('try on model') + x = np.array([[1,2]]) + + model.optimize_for(x, backend = pass_name) + + out = model(x) + model.export("my_model") + print(out.shape) + return args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))} # execute in MXNet print('-------------------------------') From 87ee2c2e8a487230ebcdefbcf9baa1a8b7fc7382 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 14 Dec 2021 10:50:18 +0000 Subject: [PATCH 07/23] merge master --- 3rdparty/onednn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/onednn b/3rdparty/onednn index f40443c41342..e2d45252ae9c 160000 --- a/3rdparty/onednn +++ b/3rdparty/onednn @@ -1 +1 @@ -Subproject commit f40443c413429c29570acd6cf5e3d1343cf647b4 +Subproject commit e2d45252ae9c3e91671339579e3c0f0061f81d49 From 1054b505e8293e73ffef30acdc2b623c9f863088 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 16 Dec 2021 07:12:10 +0000 Subject: [PATCH 08/23] add nccl reduce inside mxnet --- include/mxnet/c_api.h | 8 ++++++ include/mxnet/resource.h | 14 +++++++--- python/mxnet/gluon/contrib/__init__.py | 2 ++ src/c_api/c_api.cc | 34 +++++++++++++++++++++++ src/imperative/attach_op_resource_pass.cc | 6 ++++ src/operator/contrib/nn/reduce_op.cc | 2 +- src/operator/contrib/nn/reduce_op.cu | 3 +- 7 files changed, 62 insertions(+), 7 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index c496a84e234f..7d50c7e068f2 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -525,6 +525,14 @@ MXNET_DLL int MXEngineSetBulkSize(int bulk_size, int* prev_bulk_size); */ MXNET_DLL int MXGetGPUCount(int* out); +/*! + * \brief Get the compute capability of a given GPU. + * \param dev the GPU number to query + * \param out pointer to integer that will hold the compute capability of the queried GPU. + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXGetGPUSMArch(int dev, int* out); + /*! 
* \brief get the free and total available memory on a GPU * Note: Deprecated, use MXGetGPUMemoryInformation64 instead. diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index b856002cb76f..038d7c20fcb2 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -39,16 +39,22 @@ struct ResourceRequest { /*! \brief Resource type, indicating what the pointer type is */ enum Type { /*! \brief mshadow::Random object */ - kRandom, + kRandom = 0, /*! \brief A dynamic temp space that can be arbitrary size */ - kTempSpace, + kTempSpace = 1, /*! \brief common::RandGenerator object, which can be used in GPU kernel functions */ - kParallelRandom + kParallelRandom = 2 #if MXNET_USE_CUDNN == 1 , /*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */ - kCuDNNDropoutDesc + kCuDNNDropoutDesc = 3 #endif // MXNET_USE_CUDNN == 1 +#if MXNET_USE_CUDA + , + /*! \brief Resource indicating the usage of multi GPU communication, used to prevent + * multiple ops of doing it at the same time */ + kMultiGPUComm = 4 +#endif // MXNET_USE_CUDA == 1 }; /*! \brief type of resources */ Type type; diff --git a/python/mxnet/gluon/contrib/__init__.py b/python/mxnet/gluon/contrib/__init__.py index 9b9f0ce8d696..8d421a137336 100644 --- a/python/mxnet/gluon/contrib/__init__.py +++ b/python/mxnet/gluon/contrib/__init__.py @@ -21,3 +21,5 @@ from . import data from . import estimator + +from . import nn \ No newline at end of file diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f29ed768c121..a8e478d1a0ba 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1943,6 +1943,16 @@ int MXGetGPUCount(int* out) { API_END(); } +int MXGetGPUSMArch(int dev, int* out) { + API_BEGIN(); +#if MXNET_USE_CUDA == 1 + *out = SMArch(dev); +#else + LOG(FATAL) << "Compile with USE_CUDA=1 to query CUDA device properties."; +#endif + API_END(); +} + // Deprecated: use MXGetGPUMemoryInformation64() instead. 
 int MXGetGPUMemoryInformation(int dev, int* free_mem, int* total_mem) {
   API_BEGIN();
@@ -1960,6 +1970,30 @@ int MXGetGPUMemoryInformation64(int dev, uint64_t* free_mem, uint64_t* total_mem
   API_END();
 }
 
+int MXNCCLGetUniqueIdSize(int* size) {
+  API_BEGIN();
+#if MXNET_USE_CUDA && MXNET_USE_NCCL
+  *size = sizeof(ncclUniqueId);
+#else
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NCCL=1 to have NCCL support.";
+#endif
+  API_END();
+}
+
+int MXNCCLGetUniqueId(void* out) {
+  API_BEGIN();
+#if MXNET_USE_CUDA && MXNET_USE_NCCL
+  auto ret = ncclGetUniqueId(reinterpret_cast<ncclUniqueId*>(out));
+  if (ret != ncclSuccess) {
+    LOG(FATAL) << "Failed to get the NCCL unique id";
+  }
+#else
+  LOG(FATAL) << "Compile with USE_CUDA=1 and USE_NCCL=1 to have NCCL support.";
+#endif
+  API_END();
+}
+
 int MXGetVersion(int* out) {
   API_BEGIN();
   *out = static_cast<int>(MXNET_VERSION);

diff --git a/src/imperative/attach_op_resource_pass.cc b/src/imperative/attach_op_resource_pass.cc
index 17d6d7a41dc3..5b8e3ebf4649 100644
--- a/src/imperative/attach_op_resource_pass.cc
+++ b/src/imperative/attach_op_resource_pass.cc
@@ -83,6 +83,12 @@ void AttachOpResources(const Graph& g,
           break;
         }
 #endif  // MXNET_USE_CUDNN == 1
+#if MXNET_USE_CUDA == 1
+        case ResourceRequest::kMultiGPUComm: {
+          requested.push_back(ResourceManager::Get()->Request(ctx, req));
+          break;
+        }
+#endif  // MXNET_USE_CUDA == 1
         default:
           LOG(FATAL) << "resource type " << req.type << " is not yet supported";
       }

diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc
index b21c290a505e..35f840affed8 100644
--- a/src/operator/contrib/nn/reduce_op.cc
+++ b/src/operator/contrib/nn/reduce_op.cc
@@ -62,7 +62,7 @@ NNVM_REGISTER_OP(_contrib_NCCLReduce)
   })
 .set_attr<mxnet::FInferShape>("FInferShape", NCCLReduceShape)
 .set_attr<nnvm::FInferType>("FInferType", NCCLReduceType)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_NCCLReduce"})
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_NCCLReduce"})
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int>>{{0, 0}};

diff --git a/src/operator/contrib/nn/reduce_op.cu b/src/operator/contrib/nn/reduce_op.cu
index 6c0a1a92f2fa..2b7d9ff0c935 100644
--- a/src/operator/contrib/nn/reduce_op.cu
+++ b/src/operator/contrib/nn/reduce_op.cu
@@ -36,10 +36,9 @@ void NCCLReduceCompute(const nnvm::NodeAttrs& attrs,
   const index_t size = inputs[0].shape_.Size() *
                        common::mshadow_type_info(inputs[0].type_flag_).size;
   if (req[0] != OpReqType::kAddTo) {
-
     ncclResult_t result = ncclReduce(inputs[0].dptr_,
                                      outputs[0].dptr_,
-                                     size, ncclInt8, ncclSum, param.root_rank,
+                                     // ncclReduce takes an element count, not bytes; the byte
+                                     // count above only pairs with ncclInt8. Note that ncclAvg
+                                     // requires NCCL >= 2.10.
+                                     inputs[0].shape_.Size(), ncclFloat32, ncclAvg,
+                                     param.root_rank,
                                      comm,
                                      mshadow::Stream<gpu>::GetStream(ctx.get_stream<gpu>()));

From 76c069e1d2424c8e9d0b65737fb0ffc4097ffa68 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 21 Dec 2021 12:36:59 +0000
Subject: [PATCH 09/23] add reduce into graph

---
 example/extensions/lib_pass/pass_lib.cc | 92 ++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc
index 31b362addb8d..7170199ba380 100644
--- a/example/extensions/lib_pass/pass_lib.cc
+++ b/example/extensions/lib_pass/pass_lib.cc
@@ -31,11 +31,29 @@ using namespace mxnet::ext;
 
 /* \brief a basic pass that prints out the options and the graph */
+/*
 MXReturnValue myPass(mxnet::ext::Graph* g,
                      const std::unordered_map<std::string, std::string>& options) {
+  int cur_rank = -1;
+  std::string nccl_unique_id = "asd";
   for (auto kv : options) {
-    std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;
+    std::cout
<< "option: " << kv.first << " ==> " << kv.second << std::endl;} + + std::cout<<" print options" << std::endl; + size_t length = g->size(); + mxnet::ext::Node *tmp; + for (int i = 0;i < length; i += 1) + { + tmp = g->getNode(i); + std::cout<name<attrs; + for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) + { + std::cout<name<<":attrs:"<first<<" : "<second<addNode("myConv","_contrib_NCCLReduce"); + /* //g->addNode("myConv","Convolution"); mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2); mxnet::ext::Node *old_layer = g->getNode(g->size()-1); @@ -92,6 +110,78 @@ MXReturnValue myPass(mxnet::ext::Graph* g, return MX_SUCCESS; } +*/ + + +MXReturnValue myPass(mxnet::ext::Graph* g, + const std::unordered_map& options) { + std::string cur_rank = ""; + std::string nccl_unique_id = ""; + std::string num_gpus = ""; + for (auto kv : options) { + std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; + if (kv.first == "rank") + { + cur_rank = kv.second.c_str(); + } + if (kv.first == "nccl_unique_id") + nccl_unique_id = kv.second.c_str(); + if (kv.first == "num_gpus") + num_gpus = kv.second.c_str(); + } + std::cout<<"nccl:"<size(); + mxnet::ext::Node *tmp; + std::string root_rank; + mxnet::ext::Node *target_node; + int index = 0; + for (int i = 0;i < length; i += 1) + { + target_node = g->getNode(i); + std::cout<<"deal with:" << target_node->name<name); + if (it == options.end()) {continue;} // req_grad == null + root_rank = it->second; + mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + std::to_string(index),"_contrib_NCCLReduce"); + index += 1; + auto new_attrs = &new_reduce->attrs; + auto old_attrs = target_node->attrs; + for (auto it = old_attrs.begin(); it!=old_attrs.end(); it++) + { + if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__") + { + new_attrs ->insert({{it->first, it->second}}); + } + } + new_attrs->insert({{"nccl_unique_id", nccl_unique_id}}); + new_attrs->insert({{"num_gpus", num_gpus}}); + new_attrs->insert({{"rank", cur_rank}}); + new_attrs->insert({{"root_rank", root_rank}}); + + for (int i=0;ioutputs.size(); i++) + { + new_reduce->outputs.push_back(target_node->outputs[i]); + mxnet::ext::Node *output_node = target_node->outputs[i].node; + int index = target_node->outputs[i].entry; + //std::cout<<"try change:"<name<<":"<inputs.size()<inputs[index].node = new_reduce; + } + for (int i=0;ioutputs.size(); i++) + { + target_node->outputs.pop_back(); + } + target_node->outputs.push_back({new_reduce, 0}); + new_reduce->inputs.push_back({target_node, 0}); + + } + g->print(); + + + return MX_SUCCESS; +} + + REGISTER_PASS(myPass).setBody(myPass); From dff8bf47a35f34f9f74ec83a5e1e7115e4c0044b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:26 +0000 Subject: [PATCH 10/23] reverse operation --- src/operator/contrib/nn/reduce_op-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op-inl.h b/src/operator/contrib/nn/reduce_op-inl.h index 771a4fa6ecce..8b08d7d6994f 100644 --- a/src/operator/contrib/nn/reduce_op-inl.h +++ b/src/operator/contrib/nn/reduce_op-inl.h @@ -38,7 +38,7 @@ struct NCCLReduceParam : public dmlc::Parameter { }; template -struct ncclreduce_backward { +struct ncclreduce_compute { template MSHADOW_XINLINE static void Map(int i, DType* in_grad, @@ -48,7 +48,7 @@ struct ncclreduce_backward { }; template -void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, +void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, const 
OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -63,7 +63,7 @@ void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - Kernel, xpu>::Launch(s, + Kernel, xpu>::Launch(s, in_grad.Size(), in_grad.dptr(), out_grad.dptr()); From f30f8e53989a7ac6861f9ad61ef7660a3b03874a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:36 +0000 Subject: [PATCH 11/23] reverse operation --- src/operator/contrib/nn/reduce_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op.cc b/src/operator/contrib/nn/reduce_op.cc index 35f840affed8..e00245b06ed1 100644 --- a/src/operator/contrib/nn/reduce_op.cc +++ b/src/operator/contrib/nn/reduce_op.cc @@ -56,6 +56,7 @@ NNVM_REGISTER_OP(_contrib_NCCLReduce) .set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) +.set_attr("FCompute", NCCLReduceCompute) .set_attr("FListInputNames", [](const NodeAttrs& attrs) { return std::vector{"data"}; @@ -83,7 +84,6 @@ NNVM_REGISTER_OP(_backward_NCCLReduce) .set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr("FCompute", NCCLReduceBackward); +.set_attr("TIsBackward", true); } } \ No newline at end of file From f0e2f72d29673e82734ce318cb10a2ee2da71bcf Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 21 Dec 2021 13:03:40 +0000 Subject: [PATCH 12/23] reverse operation --- src/operator/contrib/nn/reduce_op.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/operator/contrib/nn/reduce_op.cu b/src/operator/contrib/nn/reduce_op.cu index 2b7d9ff0c935..655023c6c964 100644 --- a/src/operator/contrib/nn/reduce_op.cu +++ b/src/operator/contrib/nn/reduce_op.cu @@ -18,7 +18,7 @@ namespace mxnet { namespace op { -void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, +void NCCLReduceBackward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -50,10 +50,10 @@ void NCCLReduceCompute(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_NCCLReduce) -.set_attr("FCompute", NCCLReduceCompute); +.set_attr("FCompute", NCCLReduceCompute); NNVM_REGISTER_OP(_backward_NCCLReduce) -.set_attr("FCompute", NCCLReduceBackward); +.set_attr("FCompute", NCCLReduceBackward); } // namespace op } // namespace mxnet \ No newline at end of file From f5afd608f3ecb1d64c5872b07f0455744aff3c2d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Jan 2022 11:33:04 +0000 Subject: [PATCH 13/23] print version --- cpp-package/include/mxnet-cpp/executor.h | 9 +- example/extensions/lib_pass/pass_lib.cc | 2 +- example/extensions/lib_pass/test_pass.py | 218 ++++++++++++++++++++--- include/mxnet/c_api.h | 5 +- include/mxnet/imperative.h | 3 +- python/mxnet/ndarray/ndarray.py | 17 +- src/c_api/c_api_ndarray.cc | 20 ++- src/imperative/cached_op.cc | 64 +++++-- src/imperative/cached_op.h | 12 +- src/imperative/imperative.cc | 20 ++- src/imperative/imperative_utils.cc | 20 ++- src/imperative/imperative_utils.h | 3 +- src/imperative/naive_cached_op.h | 3 +- src/operator/subgraph_op_common.cc | 2 +- 14 files changed, 333 insertions(+), 65 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h index fff559b79df3..bb602fc282c9 100644 --- a/cpp-package/include/mxnet-cpp/executor.h +++ b/cpp-package/include/mxnet-cpp/executor.h @@ -132,7 +132,9 @@ class Executor { 1, nullptr, nullptr), - 
0); + 0, + nullptr, + nullptr); } else { CHECK_EQ(MXAutogradBackwardEx(out_handles.size(), out_handles.data(), @@ -144,7 +146,10 @@ class Executor { 1, nullptr, nullptr), - 0); + 0, + 0, + nullptr, + nullptr); } grad_arrays.clear(); grad_arrays.reserve(arg_arrays.size()); diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index 7170199ba380..df7ac8fa079f 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -143,7 +143,7 @@ MXReturnValue myPass(mxnet::ext::Graph* g, auto it = options.find(target_node->name); if (it == options.end()) {continue;} // req_grad == null root_rank = it->second; - mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + std::to_string(index),"_contrib_NCCLReduce"); + mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name,"_contrib_NCCLReduce"); index += 1; auto new_attrs = &new_reduce->attrs; auto old_attrs = target_node->attrs; diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index 7e65eb60c75c..df115eee7853 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,8 +27,15 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd +import numpy as np +from mxnet.lr_scheduler import PolyScheduler from mxnet import np, npx -from gluonnlp.layers import get_activation +from pos_trainer import POS_Trainer +try: + import horovod.mxnet as hvd +except ImportError: + pass +#from gluonnlp.layers import get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -39,6 +46,40 @@ path = os.path.abspath('libpass_lib.dll') mx.library.load(path) + +class _NCCLReduceHelper(object): + _init = False + nccl_id = None + num_gpus = None + rank = None + + @staticmethod + def init(num_gpus, root_rank): + """Communicate the NCCL unique id""" + cls = _NCCLReduceHelper + if not cls._init: + cls._init = True + import ctypes + try: + from mpi4py import MPI + except: + raise ImportError("Spatial parallel modules require mpi4py package.") + import numpy as np + nccl_id_size = ctypes.c_int() + check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) + nccl_id_size = nccl_id_size.value + cls.nccl_id = np.zeros(nccl_id_size, np.byte) + check_call(_LIB.MXNCCLGetUniqueId( + cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) + global_comm = MPI.COMM_WORLD + rank = global_comm.rank + color = rank / num_gpus + comm = global_comm.Split(color, rank) + comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) + cls.num_gpus = num_gpus + cls.rank = rank % num_gpus + cls.root_rank = root_rank % num_gpus + assert num_gpus == cls.num_gpus ############################################### # Test with not consuming params ############################################### @@ -55,7 +96,7 @@ def __init__(self): super().__init__() self.l1 = nn.Dense(in_units=2, units=2, flatten=False) #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) - self.act1 = get_activation('relu') + #self.act1 = get_activation('relu') #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) #self.seq.add(get_activation('relu')) @@ -66,42 +107,167 @@ def __init__(self): def forward(self, input): input = self.l1(input) - input = self.act1(input) + #print(input) + #input = self.act1(input) return input def test_model(pass_name): + from mxnet import gluon + from mxnet.gluon import Block, nn, HybridBlock + from mxnet import init + + + + hvd.init() + rank = 
hvd.rank() + size = hvd.size() + ctx = mx.gpu(rank) + + np.random.seed(1234 + 10 * rank) + mx.random.seed(1234 + 10 * rank) + + num_gpus = size + root_rank = 0 + helper = _NCCLReduceHelper + helper.init(num_gpus, 0) + kwargs = { + 'num_gpus': num_gpus, + 'rank': helper.rank, + 'root_rank': helper.root_rank, + 'nccl_unique_id': helper.nccl_id.ctypes.data} + + options = {"num_gpus":num_gpus, "rank": rank, "nccl_unique_id": helper.nccl_id.ctypes.data} + model = Easynet() - model.initialize() + + if rank == 0: + model.l1.weight.initialize(init=init.One(), ctx=ctx) + model.l1.bias.initialize(init=init.One(), ctx=ctx) + else: + model.l1.weight.initialize(init = init.Zero(), ctx = ctx) + model.l1.bias.initialize(init=init.Zero(), ctx=ctx) model.hybridize() + #param_dict = classify_net.collect_params() + #params = [p for p in param_dict.values() if p.grad_req != 'null'] + params = model.collect_params() + index = 0 + dic = {} + lr_scheduler = PolyScheduler(max_update=1, + base_lr=1e-3, + warmup_begin_lr=0.0, + pwr=1, + final_lr=0.0, + warmup_steps=0, + warmup_mode='linear') + optimizer_params = {'learning_rate': 1e-3, + 'wd': 1e-2, + 'lr_scheduler': lr_scheduler} + trainer = POS_Trainer(params, "adam", optimizer_params) + cor_rank = trainer.correspond_ranks() + for name in params: + type = name.split('.')[-1] + index = cor_rank[params[name]._uuid] + new_name = params[name]._uuid.replace('-', '_') + '_' + type + dic[new_name] = index - print('try on model') - x = np.array([[1,2]]) + options.update(dic) + print(options) + #return + backward_options = {"partition_grad":True, "current_rank":rank} + for k in options: + if k in ['num_gpus', 'rank', 'nccl_unique_id']: + continue + backward_options['ncclreduce_' + k + '_backward'] = options[k] + print(backward_options) + x = np.ones((1,2), ctx = ctx) + label = np.ones((2, ), ctx = ctx) * rank + #print(options) - model.optimize_for(x, backend = pass_name) + loss_function = gluon.loss.L2Loss() - out = model(x) - model.export("my_model") - print(out.shape) - return - args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))} - # execute in MXNet - print('-------------------------------') - print('Testing regular MXNet execution') - inputs = [a,b] + model.optimize_for(x, backend = pass_name, **options) + #model.export("my_reduce_" + str(rank)) + for i in range(1): + with mx.autograd.record(): + out = model(x) + loss = loss_function(out, label).mean() / size + print("now call backward in python") + #print(loss.backward) + #print(loss) + #print(loss.backward) + loss.backward(backward_option = backward_options) + mx.npx.waitall() + mx.nd.waitall() + for name in params: + print(name, params[name].list_grad()[0]) + + + #print(out) + + +def test_reduce(pass_name): + from mxnet import gluon + from mxnet.gluon import Block, nn, HybridBlock + from mxnet import init + + hvd.init() + rank = hvd.rank() + size = hvd.size() + ctx = mx.gpu(rank) + + num_gpus = size + root_rank = 0 + helper = _NCCLReduceHelper + helper.init(num_gpus, root_rank) + kwargs = { + 'num_gpus': num_gpus, + 'rank': helper.rank, + 'root_rank': helper.root_rank, + 'nccl_unique_id': helper.nccl_id.ctypes.data} + + options = {"rank": rank, "nccl_unique_id":helper.nccl_id.ctypes.data} + + a = mx.sym.var('a') + sym = mx.sym.contrib.NCCLReduce(a, **kwargs) + + inputs = [a] sym_block = nn.SymbolBlock(sym, inputs) - sym_block.initialize() - out = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) + sym_block.initialize(ctx = ctx) + p = mx.nd.ones((2, 3), ctx=ctx) * (rank + 2) + sym_block.optimize_for(p, backend = 
pass_name, **options) + sym_block.export("my_choice") + return + + print(p) + out = sym_block(p) + mx.nd.waitall() + mx.npx.waitall() print(out) + return + class ReduceNet(nn.HybridBlock): + def __init__(self): + super().__init__() + self.reduce_layer = NCCLReduce(num_gpus=2, root_rank=0) + def forward(self, x): + x = self.reduce_layer(x) + return x + + net = ReduceNet() + net.initialize(ctx=ctx) + net.hybridize() + data = mx.nd.ones((3,2), ctx = ctx) + net(data) + #data = mx.np.ones((1, 10), ctx=ctx) * rank + a = mx.sym.var('a') + out = net(a) + inputs = [a] + out_sym = nn.SymbolBlock(out, inputs) + out_sym.initialize(ctx = ctx) + true_out = out_sym(mx.nd.ones((3,2))) + + print(true_out) - # Gluon optimize_for - print('-------------------------------') - print('Testing pass "%s" Gluon Hybridize with shapes/types without inference' % pass_name) - inputs = [a,b] - sym_block2 = nn.SymbolBlock(sym, inputs) - sym_block2.initialize() - sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name) - sym_block2.export('modified') test_model('myPass') diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 7d50c7e068f2..550bb22c2aa7 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1320,7 +1320,10 @@ MXNET_DLL int MXAutogradBackwardEx(uint32_t num_output, int create_graph, int is_train, NDArrayHandle** grad_handles, - int** grad_stypes); + int** grad_stypes, + const mx_uint num_options, + const char** keys, + const char** vals); /* * \brief get the graph constructed by autograd. * \param handle ndarray handle diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index e4e3f6a938d0..f49637dbf263 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -279,7 +279,8 @@ class Imperative { const std::vector& variables, bool is_train, bool retain_graph, - bool create_graph); + bool create_graph, + const std::unordered_map backward_options_map = {}); /*! \brief Return the marked nonleaf nodes. */ std::vector ListNonleafVariables(const nnvm::Symbol& sym) const; /*! \return AutogradRuntime singleton */ diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index fd76789918a1..7ac9f13113cb 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -34,7 +34,7 @@ from functools import reduce # pylint: disable=redefined-builtin import numpy as np from ..base import _LIB, numeric_types, integer_types -from ..base import c_array, c_array_buf, c_handle_array, mx_real_t +from ..base import c_array, c_array_buf, c_handle_array, mx_real_t, c_str_array from ..base import mx_uint, NDArrayHandle, check_call, mx_int, mx_int64 from ..base import ctypes2buffer from ..dlpack import ndarray_to_dlpack_for_read, ndarray_to_dlpack_for_write @@ -2924,7 +2924,7 @@ def detach(self): check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) return _ndarray_cls(hdl) - def backward(self, out_grad=None, retain_graph=False, train_mode=True): + def backward(self, out_grad=None, retain_graph=False, train_mode=True, backward_option = {}): """Compute the gradients of this NDArray w.r.t variables. 
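 
         The optional ``backward_option`` dictionary is forwarded key by key to
         the backward graph pass; this series uses it to carry the
         ``partition_grad`` and ``current_rank`` options consumed by
         CachedOp::SetBackwardGraph.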
Parameters @@ -2942,7 +2942,13 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ograd_handles = [NDArrayHandle(0)] else: ograd_handles = [out_grad.handle] - + print("in ndarray.py in line 2945") + key_list = [] + val_list = [] + for key, val in backward_option.items(): + key_list.append(key) + val_list.append(str(val)) + print("in ndarray.py in line 2951", key_list, val_list) check_call(_LIB.MXAutogradBackwardEx( 1, c_handle_array([self]), c_array(NDArrayHandle, ograd_handles), @@ -2952,7 +2958,10 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True): ctypes.c_int(0), ctypes.c_int(train_mode), ctypes.c_void_p(0), - ctypes.c_void_p(0))) + ctypes.c_void_p(0), + mx_uint(len(key_list)), + c_str_array(key_list), + c_str_array(val_list))) def tostype(self, stype): """Return a copy of the array with chosen storage type. diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 2e9c0a373621..b436f7991ac8 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -362,6 +362,9 @@ int MXAutogradBackward(uint32_t num_output, false, true, nullptr, + nullptr, + 0, + nullptr, nullptr); } @@ -374,10 +377,15 @@ int MXAutogradBackwardEx(uint32_t num_output, int create_graph, int is_train, NDArrayHandle** grad_handles, - int** grad_stypes) { + int** grad_stypes, + const mx_uint num_options, + const char** keys, + const char** vals) { MXAPIThreadLocalEntry<>* ret = MXAPIThreadLocalStore<>::Get(); API_BEGIN(); - + std::unordered_map backward_options_map; + for (mx_uint i = 0; i < num_options; ++i) + backward_options_map.emplace(keys[i], vals[i]); std::vector outputs, ograds, variables; outputs.reserve(num_output); for (uint32_t i = 0; i < num_output; ++i) { @@ -397,9 +405,13 @@ int MXAutogradBackwardEx(uint32_t num_output, for (uint32_t i = 0; i < num_variables; ++i) { variables.emplace_back(reinterpret_cast(var_handles[i])); } - + std::cout<<"in c_api_ndarray.cc in line 400" << std::endl; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 408 in c_api_ndarray.cc:" << it->first << " : " << it->second<Backward(outputs, ograds, variables, is_train, retain_graph, create_graph); + Imperative::Get()->Backward(outputs, ograds, variables, is_train, retain_graph, create_graph, backward_options_map); if (num_variables != 0) { ret->ret_handles.clear(); ret->out_types.clear(); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 894ef09a1d16..4ad23b72a9c0 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -259,7 +259,8 @@ void SetBackwardInputEid(const std::vector& bwd_in_dep, bool CachedOp::SetBackwardGraph(GraphInfo* info, const std::vector& reqs, const std::vector& inputs, - bool detect_inplace_addto) { + bool detect_inplace_addto, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; std::lock_guard lock(mutex_); @@ -279,7 +280,27 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, g.attrs["context"] = std::make_shared( std::vector(g.indexed_graph().num_nodes(), default_ctx)); } + //deal with delete + auto pg_option = backward_options_map.find("partition_grad"); + if (pg_option != backward_options_map.end()) + { + auto it = backward_options_map.find("current_rank"); + int cur_rank = std::atoi(it->second.c_str()); + auto help_it = g.outputs.begin(); + auto output_it = g.outputs.begin(); + while (output_it!=g.outputs.end()) + { + it = backward_options_map.find(output_it->node 
->attrs.name); + if (it!=backward_options_map.end() && atoi(it->second.c_str()) != cur_rank) + { + g.outputs.erase(output_it); + } + else{ + output_it++; + } + } + } const auto& idx = g.indexed_graph(); if (info->bwd_input_eid.size() != inputs.size()) { @@ -378,7 +399,10 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); - + for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) + { + std::cout<<"in line 383 graph node " << it->node ->attrs.name << std::endl; + } return false; } @@ -902,7 +926,8 @@ void CachedOp::DynamicBackward(const bool retain_graph, const OpStatePtr& op_state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; @@ -915,11 +940,20 @@ void CachedOp::DynamicBackward(const bool retain_graph, std::lock_guard lock(state.mutex); state.info.fwd_graph = runtime.info.fwd_graph; state.info.input_map = runtime.info.input_map; - SetBackwardGraph(&state.info, reqs, inputs); + std::cout<<"in line 918 in cached_op.cc" << std::endl; + SetBackwardGraph(&state.info, reqs, inputs, false, backward_options_map); runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } nnvm::Graph& g = runtime.info.full_graph; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++) + { + std::cout<<"in line 930 backward:" << it->first<<":"<second << std::endl; + } + for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) + { + std::cout<<"in line 926 graph node " << it->node ->attrs.name << std::endl; + } const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; auto& states = runtime.op_states; @@ -1005,7 +1039,8 @@ void CachedOp::StaticBackward(const bool retain_graph, const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; @@ -1014,9 +1049,15 @@ void CachedOp::StaticBackward(const bool retain_graph, auto& state = state_ptr.get_state(); std::lock_guard lock(state.mutex); - bool match = SetBackwardGraph(&state.info, reqs, inputs, true); + bool match = SetBackwardGraph(&state.info, reqs, inputs, true, backward_options_map); nnvm::Graph& g = state.info.full_graph; + //deal with options + // + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 1034 in cachedop:" << it->first << " : " << it->second<& inputs, const std::vector& reqs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map backward_options_map) { const auto& fwd_idx = fwd_graph_.indexed_graph(); const auto& full_idx = full_graph_.indexed_graph(); const auto& mutable_input_nodes = fwd_idx.mutable_input_nodes(); @@ -1121,9 +1163,11 @@ void CachedOp::Backward(const bool retain_graph, try { if (config_.static_alloc) { - StaticBackward(retain_graph, state, inputs, reqs, outputs); + std::cout<<"call static back"<set_bulk_size(prev_bulk_size); @@ -1261,7 +1305,7 @@ void CachedOpBackward(const OpStatePtr& state_ptr, // pass a flag to determine whether to record computation inside an operator. 
// Let's use false here for now and design a solution when the second-order // differentiation is supported. - s.op->Backward(false, s.forward_state, in_ptrs, req, out_ptrs); + s.op->Backward(false, s.forward_state, in_ptrs, req, out_ptrs, {}); Imperative::Get()->set_is_training(orig_is_train); // Clean up what we recorded. diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index 079a56e20a12..e8d441114a0c 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -501,7 +501,8 @@ class CachedOp { const OpStatePtr& state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); // backward storage type inference virtual bool BackwardStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, @@ -606,7 +607,8 @@ class CachedOp { bool SetBackwardGraph(GraphInfo* info, const std::vector& reqs, const std::vector& inputs, - bool detect_inplace_addto = false); + bool detect_inplace_addto = false, + const std::unordered_map backward_options_map = {}); bool CheckDynamicShapeExists(const Context& default_ctx, const std::vector& inputs, bool erase_result); @@ -632,12 +634,14 @@ class CachedOp { const OpStatePtr& op_state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); void StaticBackward(const bool retain_graph, const OpStatePtr& state_ptr, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs); + const std::vector& outputs, + const std::unordered_map backward_options_map); size_t BwdOriginalInput(const std::vector& input_map, size_t new_i); CachedOpConfig config_; diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index b9bdaac9476f..bef07a635d17 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -440,7 +440,8 @@ std::vector Imperative::Backward(const std::vector& outputs, const std::vector& variables, bool is_train, bool retain_graph, - bool create_graph) { + bool create_graph, + const std::unordered_map backward_options_map) { using namespace nnvm; using namespace imperative; static const std::vector zero_ops{Op::Get("zeros_like"), Op::Get("_zeros")}; @@ -519,7 +520,7 @@ std::vector Imperative::Backward(const std::vector& outputs, for (const auto& i : nleaf_vars) { us.emplace_back(NodeEntry{i, 0, 0}); } - + std::cout<<"in imperative.cc in line 522"< Imperative::Backward(const std::vector& outputs, graph.outputs.push_back(e); } } + for (auto it = graph.outputs.begin(); it != graph.outputs.end(); it++ ) + { + std::cout<<"in line 547 in imperative.cc graph node " << it->node ->attrs.name << std::endl; + } const auto& idx = graph.indexed_graph(); // get number of nodes used in forward pass size_t num_forward_nodes = 0; @@ -712,7 +717,10 @@ std::vector Imperative::Backward(const std::vector& outputs, bool prev_recording = set_is_recording(create_graph); bool prev_training = set_is_training(is_train); int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); - + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 722 in imperative.cc:" << it->first << " : " << it->second< Imperative::Backward(const std::vector& outputs, std::move(ref_count), &states, dispatch_modes, - is_recording()); + is_recording(), + nullptr, + nullptr, + false, + backward_options_map); } catch (const dmlc::Error& e) { 
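       // Restore the previous engine bulk size and the autograd
       // recording/training flags before the exception propagates.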
Engine::Get()->set_bulk_size(prev_bulk_size); set_is_recording(prev_recording); diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc index e3a58804d8ac..812318dee5f5 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -75,7 +75,8 @@ void InvokeOperator(const nnvm::IndexedGraph& idx, const std::vector& ndoutputs, std::vector* p_req, std::vector* p_ref_count, - std::function invoke) { + std::function invoke, + const std::unordered_map backward_options_map) { static const auto bwd_cached_op = Op::Get("_backward_CachedOp"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); static auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); @@ -88,7 +89,11 @@ void InvokeOperator(const nnvm::IndexedGraph& idx, const auto& cached_op = dmlc::get(node.source->attrs.parsed); nnvm::Node* fwd_node = node.source->control_deps[0].get(); auto fwd_node_id = idx.node_id(fwd_node); - cached_op->Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs); + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 94 in imperative_utils.cc:" << it->first << " : " << it->second<Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs, backward_options_map); } else if (createop.count(node.source->op())) { mxnet::ShapeVector arg_shapes; nnvm::DTypeVector arg_dtypes; @@ -138,7 +143,8 @@ void RunGraph(const bool retain_graph, bool recording, mxnet::ShapeVector* shapes, const imperative::CachedOpMonCallback& callback, - const bool monitor_all) { + const bool monitor_all, + const std::unordered_map backward_options_map) { CHECK(shapes == nullptr); for (size_t i = node_start; i < node_end; ++i) { const nnvm::IndexedGraph::Node& node = idx[i]; @@ -161,8 +167,12 @@ void RunGraph(const bool retain_graph, Imperative::Get()->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, state); } }; + for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) + { + std::cout<< "in line 172 in rungraph in imperative_utils.cc:" << it->first << " : " << it->second< backward_options_map = {}); void NaiveRunGraph(const bool retain_graph, const Context& default_ctx, diff --git a/src/imperative/naive_cached_op.h b/src/imperative/naive_cached_op.h index cd1365508f97..037d4d489448 100644 --- a/src/imperative/naive_cached_op.h +++ b/src/imperative/naive_cached_op.h @@ -48,7 +48,8 @@ class NaiveCachedOp : public CachedOp { const OpStatePtr& state, const std::vector& inputs, const std::vector& reqs, - const std::vector& outputs) override { + const std::vector& outputs, + const std::unordered_map backward_options_map) override { LOG(FATAL) << "Backward is not supported in NaiveCachedOp."; } // backward storage type inference diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index ad12d3ded5d2..a5647bcad859 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -274,7 +274,7 @@ void LoopState::Backward(int iter_no, outputs.push_back(&igrad_bufs[i]); CHECK_EQ(outputs.size(), op->num_inputs()); auto state = all_states[iter_no]; - op->Backward(false, state, inputs, req, outputs); + op->Backward(false, state, inputs, req, outputs, {}); // If an input and an output share the array, the output array will be changed // by CachedOp. We need to copy data to the real output. 
for (size_t i = 0; i < igrads.size(); i++) From 33de36b7d582ac568b3d7f8abb7eff03bb13bfce Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 17 Jan 2022 12:45:34 +0000 Subject: [PATCH 14/23] add backward option --- example/extensions/lib_pass/test_pass.py | 25 ++++++++++++++++++------ python/mxnet/ndarray/ndarray.py | 2 -- src/c_api/c_api_ndarray.cc | 5 ----- src/imperative/cached_op.cc | 24 ++--------------------- src/imperative/imperative.cc | 10 +--------- src/imperative/imperative_utils.cc | 9 +-------- 6 files changed, 23 insertions(+), 52 deletions(-) diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index df115eee7853..e4087a185528 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -95,7 +95,7 @@ class Easynet(nn.HybridBlock): def __init__(self): super().__init__() self.l1 = nn.Dense(in_units=2, units=2, flatten=False) - #self.l2 = nn.Dense(in_units=2, units=2, flatten=False) + self.l2 = nn.Dense(in_units=2, units=2, flatten=False) #self.act1 = get_activation('relu') #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) @@ -105,11 +105,12 @@ def __init__(self): #self.l1.register_op_hook(mon_callback, monitor_all=True) - def forward(self, input): - input = self.l1(input) + def forward(self, input1, input2): + output_1 = self.l1(input1) + output_2 = self.l2(input2) #print(input) #input = self.act1(input) - return input + return output_1 + output_2 def test_model(pass_name): @@ -144,9 +145,13 @@ def test_model(pass_name): if rank == 0: model.l1.weight.initialize(init=init.One(), ctx=ctx) model.l1.bias.initialize(init=init.One(), ctx=ctx) + model.l2.weight.initialize(init=init.One(), ctx=ctx) + model.l2.bias.initialize(init=init.One(), ctx=ctx) else: model.l1.weight.initialize(init = init.Zero(), ctx = ctx) model.l1.bias.initialize(init=init.Zero(), ctx=ctx) + model.l2.weight.initialize(init=init.Zero(), ctx=ctx) + model.l2.bias.initialize(init=init.Zero(), ctx=ctx) model.hybridize() #param_dict = classify_net.collect_params() #params = [p for p in param_dict.values() if p.grad_req != 'null'] @@ -174,24 +179,32 @@ def test_model(pass_name): options.update(dic) print(options) + #return + ''' backward_options = {"partition_grad":True, "current_rank":rank} for k in options: if k in ['num_gpus', 'rank', 'nccl_unique_id']: continue backward_options['ncclreduce_' + k + '_backward'] = options[k] print(backward_options) + ''' + options = trainer.generate_graph_pass_options() + print(options) + backward_options = trainer.generate_backward_options() + print(backward_options) x = np.ones((1,2), ctx = ctx) + x2 = np.ones((1, 2), ctx=ctx) label = np.ones((2, ), ctx = ctx) * rank #print(options) loss_function = gluon.loss.L2Loss() - model.optimize_for(x, backend = pass_name, **options) + model.optimize_for(x, x2, backend = "myPass", **options) #model.export("my_reduce_" + str(rank)) for i in range(1): with mx.autograd.record(): - out = model(x) + out = model(x, x2) loss = loss_function(out, label).mean() / size print("now call backward in python") #print(loss.backward) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 7ac9f13113cb..e56ff4e34c9e 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2942,13 +2942,11 @@ def backward(self, out_grad=None, retain_graph=False, train_mode=True, backward_ ograd_handles = [NDArrayHandle(0)] else: ograd_handles = [out_grad.handle] - print("in ndarray.py in line 2945") key_list = [] val_list = 
[] for key, val in backward_option.items(): key_list.append(key) val_list.append(str(val)) - print("in ndarray.py in line 2951", key_list, val_list) check_call(_LIB.MXAutogradBackwardEx( 1, c_handle_array([self]), c_array(NDArrayHandle, ograd_handles), diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index b436f7991ac8..0b30373f68b2 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -405,11 +405,6 @@ int MXAutogradBackwardEx(uint32_t num_output, for (uint32_t i = 0; i < num_variables; ++i) { variables.emplace_back(reinterpret_cast(var_handles[i])); } - std::cout<<"in c_api_ndarray.cc in line 400" << std::endl; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 408 in c_api_ndarray.cc:" << it->first << " : " << it->second<Backward(outputs, ograds, variables, is_train, retain_graph, create_graph, backward_options_map); if (num_variables != 0) { diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 4ad23b72a9c0..c29a7ab96c80 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -286,7 +286,6 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, { auto it = backward_options_map.find("current_rank"); int cur_rank = std::atoi(it->second.c_str()); - auto help_it = g.outputs.begin(); auto output_it = g.outputs.begin(); while (output_it!=g.outputs.end()) { @@ -399,10 +398,7 @@ bool CachedOp::SetBackwardGraph(GraphInfo* info, {num_forward_entries, idx.num_node_entries()}, detect_inplace_addto); g.attrs[AddPrefix(BACKWARD, MEM_PLAN)] = std::make_shared(std::move(mem_plan)); - for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) - { - std::cout<<"in line 383 graph node " << it->node ->attrs.name << std::endl; - } + return false; } @@ -940,20 +936,11 @@ void CachedOp::DynamicBackward(const bool retain_graph, std::lock_guard lock(state.mutex); state.info.fwd_graph = runtime.info.fwd_graph; state.info.input_map = runtime.info.input_map; - std::cout<<"in line 918 in cached_op.cc" << std::endl; SetBackwardGraph(&state.info, reqs, inputs, false, backward_options_map); runtime.info.full_graph = state.info.full_graph; runtime.info.bwd_input_eid = state.info.bwd_input_eid; } nnvm::Graph& g = runtime.info.full_graph; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++) - { - std::cout<<"in line 930 backward:" << it->first<<":"<second << std::endl; - } - for (auto it = g.outputs.begin(); it != g.outputs.end(); it++ ) - { - std::cout<<"in line 926 graph node " << it->node ->attrs.name << std::endl; - } const auto& idx = g.indexed_graph(); auto& buff = runtime.buff; auto& states = runtime.op_states; @@ -1052,12 +1039,7 @@ void CachedOp::StaticBackward(const bool retain_graph, bool match = SetBackwardGraph(&state.info, reqs, inputs, true, backward_options_map); nnvm::Graph& g = state.info.full_graph; - //deal with options - // - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 1034 in cachedop:" << it->first << " : " << it->second< Imperative::Backward(const std::vector& outputs, for (const auto& i : nleaf_vars) { us.emplace_back(NodeEntry{i, 0, 0}); } - std::cout<<"in imperative.cc in line 522"< Imperative::Backward(const std::vector& outputs, graph.outputs.push_back(e); } } - for (auto it = graph.outputs.begin(); it != graph.outputs.end(); it++ ) - { - std::cout<<"in line 547 in imperative.cc graph node " << it->node ->attrs.name << std::endl; - } const 
auto& idx = graph.indexed_graph(); // get number of nodes used in forward pass size_t num_forward_nodes = 0; @@ -717,10 +712,7 @@ std::vector Imperative::Backward(const std::vector& outputs, bool prev_recording = set_is_recording(create_graph); bool prev_training = set_is_training(is_train); int prev_bulk_size = Engine::Get()->set_bulk_size(backward_bulk_size_); - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 722 in imperative.cc:" << it->first << " : " << it->second<(node.source->attrs.parsed); nnvm::Node* fwd_node = node.source->control_deps[0].get(); auto fwd_node_id = idx.node_id(fwd_node); - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 94 in imperative_utils.cc:" << it->first << " : " << it->second<Backward(retain_graph, states[fwd_node_id], ndinputs, req, ndoutputs, backward_options_map); } else if (createop.count(node.source->op())) { mxnet::ShapeVector arg_shapes; @@ -167,10 +163,7 @@ void RunGraph(const bool retain_graph, Imperative::Get()->RecordOp(NodeAttrs(node.source->attrs), ndinputs, ndoutputs, state); } }; - for (auto it = backward_options_map.begin(); it!=backward_options_map.end(); it ++ ) - { - std::cout<< "in line 172 in rungraph in imperative_utils.cc:" << it->first << " : " << it->second< Date: Fri, 21 Jan 2022 08:51:39 +0000 Subject: [PATCH 15/23] return to intial version --- example/extensions/lib_pass/pass_lib.cc | 146 +----------------------- 1 file changed, 1 insertion(+), 145 deletions(-) diff --git a/example/extensions/lib_pass/pass_lib.cc b/example/extensions/lib_pass/pass_lib.cc index df7ac8fa079f..d219299ac9ad 100644 --- a/example/extensions/lib_pass/pass_lib.cc +++ b/example/extensions/lib_pass/pass_lib.cc @@ -25,164 +25,20 @@ #include #include #include -#include #include "mxnet/lib_api.h" using namespace mxnet::ext; /* \brief a basic pass that prints out the options and the graph */ -/* MXReturnValue myPass(mxnet::ext::Graph* g, const std::unordered_map& options) { - int cur_rank = -1; - std::string nccl_unique_id = "asd"; - for (auto kv : options) { - std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;} - - std::cout<<" print options" << std::endl; - size_t length = g->size(); - mxnet::ext::Node *tmp; - for (int i = 0;i < length; i += 1) - { - tmp = g->getNode(i); - std::cout<name<attrs; - for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) - { - std::cout<name<<":attrs:"<first<<" : "<second<addNode("myConv","_contrib_NCCLReduce"); - /* - //g->addNode("myConv","Convolution"); - mxnet::ext::Node *copy_old_layer = g->getNode(g->size()-2); - mxnet::ext::Node *old_layer = g->getNode(g->size()-1); - std::cout<name<addNode("myweight3","null"); - new_weight->alloc_arg({3,2}, MXContext::CPU(0),kFloat32); - - mxnet::ext::Node *new_bias = g->addNode("mybias3","null"); - new_bias->alloc_arg({3,}, MXContext::CPU(0),kFloat32); - - - - - //new_bias->alloc_aux({2}, MXContext::CPU(0), kInt32); - - - auto attrs = copy_old_layer->attrs; - mxnet::ext::Node *new_layer = g->addNode("mylinaer","FullyConnected"); - auto new_attrs = &new_layer->attrs; - for (auto it = attrs.begin(); it!=attrs.end(); it ++ ) - { - - std::cout<first<<" : "<second<first!="num_hidden"){ - new_attrs->insert({{it->first,it->second}});} - } - new_attrs->insert({{"num_hidden","3"}}); - - std::cout<<"current:"<begin(); it!=new_attrs->end(); it ++ ) - { - std::cout<first<<" : "<second<inputs.push_back({old_layer, 0}); - 
new_layer->inputs.push_back({new_weight, 0}); - new_layer->inputs.push_back({new_bias, 0}); - old_layer->outputs.push_back({new_layer, 0}); - new_weight->outputs.push_back({new_layer, 1}); - new_bias->outputs.push_back({new_layer, 2}); - auto it = g->outputs.begin(); - g->outputs.erase(it); - mxnet::ext::NodeEntry new_entry; - new_entry.node = new_layer; - new_entry.entry = 0; - g->outputs.push_back(new_entry); - - g->print(); - - - return MX_SUCCESS; -} -*/ - - -MXReturnValue myPass(mxnet::ext::Graph* g, - const std::unordered_map& options) { - std::string cur_rank = ""; - std::string nccl_unique_id = ""; - std::string num_gpus = ""; for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; - if (kv.first == "rank") - { - cur_rank = kv.second.c_str(); - } - if (kv.first == "nccl_unique_id") - nccl_unique_id = kv.second.c_str(); - if (kv.first == "num_gpus") - num_gpus = kv.second.c_str(); - } - std::cout<<"nccl:"<size(); - mxnet::ext::Node *tmp; - std::string root_rank; - mxnet::ext::Node *target_node; - int index = 0; - for (int i = 0;i < length; i += 1) - { - target_node = g->getNode(i); - std::cout<<"deal with:" << target_node->name<name); - if (it == options.end()) {continue;} // req_grad == null - root_rank = it->second; - mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name,"_contrib_NCCLReduce"); - index += 1; - auto new_attrs = &new_reduce->attrs; - auto old_attrs = target_node->attrs; - for (auto it = old_attrs.begin(); it!=old_attrs.end(); it++) - { - if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__") - { - new_attrs ->insert({{it->first, it->second}}); - } - } - new_attrs->insert({{"nccl_unique_id", nccl_unique_id}}); - new_attrs->insert({{"num_gpus", num_gpus}}); - new_attrs->insert({{"rank", cur_rank}}); - new_attrs->insert({{"root_rank", root_rank}}); - - for (int i=0;ioutputs.size(); i++) - { - new_reduce->outputs.push_back(target_node->outputs[i]); - mxnet::ext::Node *output_node = target_node->outputs[i].node; - int index = target_node->outputs[i].entry; - //std::cout<<"try change:"<name<<":"<inputs.size()<inputs[index].node = new_reduce; - } - for (int i=0;ioutputs.size(); i++) - { - target_node->outputs.pop_back(); - } - target_node->outputs.push_back({new_reduce, 0}); - new_reduce->inputs.push_back({target_node, 0}); - } g->print(); - - return MX_SUCCESS; } - - REGISTER_PASS(myPass).setBody(myPass); MXReturnValue initialize(int version) { @@ -193,4 +49,4 @@ MXReturnValue initialize(int version) { MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl; return MX_FAIL; } -} +} \ No newline at end of file From 9f21bdcf83753fd9f479d51dcac6c2bd88b4bf86 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 21 Jan 2022 08:51:52 +0000 Subject: [PATCH 16/23] return to intial version --- example/extensions/lib_pass/test_pass.py | 248 ++--------------------- 1 file changed, 16 insertions(+), 232 deletions(-) diff --git a/example/extensions/lib_pass/test_pass.py b/example/extensions/lib_pass/test_pass.py index e4087a185528..3fa1fd22b891 100644 --- a/example/extensions/lib_pass/test_pass.py +++ b/example/extensions/lib_pass/test_pass.py @@ -27,15 +27,6 @@ import mxnet as mx from mxnet.gluon import nn from mxnet import nd -import numpy as np -from mxnet.lr_scheduler import PolyScheduler -from mxnet import np, npx -from pos_trainer import POS_Trainer -try: - import horovod.mxnet as hvd -except ImportError: - pass -#from gluonnlp.layers import 
get_activation from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle # load library @@ -46,40 +37,6 @@ path = os.path.abspath('libpass_lib.dll') mx.library.load(path) - -class _NCCLReduceHelper(object): - _init = False - nccl_id = None - num_gpus = None - rank = None - - @staticmethod - def init(num_gpus, root_rank): - """Communicate the NCCL unique id""" - cls = _NCCLReduceHelper - if not cls._init: - cls._init = True - import ctypes - try: - from mpi4py import MPI - except: - raise ImportError("Spatial parallel modules require mpi4py package.") - import numpy as np - nccl_id_size = ctypes.c_int() - check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) - nccl_id_size = nccl_id_size.value - cls.nccl_id = np.zeros(nccl_id_size, np.byte) - check_call(_LIB.MXNCCLGetUniqueId( - cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) - global_comm = MPI.COMM_WORLD - rank = global_comm.rank - color = rank / num_gpus - comm = global_comm.Split(color, rank) - comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) - cls.num_gpus = num_gpus - cls.rank = rank % num_gpus - cls.root_rank = root_rank % num_gpus - assert num_gpus == cls.num_gpus ############################################### # Test with not consuming params ############################################### @@ -90,197 +47,24 @@ def init(num_gpus, root_rank): d = mx.sym.exp(c) sym = mx.sym.log(d) - -class Easynet(nn.HybridBlock): - def __init__(self): - super().__init__() - self.l1 = nn.Dense(in_units=2, units=2, flatten=False) - self.l2 = nn.Dense(in_units=2, units=2, flatten=False) - #self.act1 = get_activation('relu') - - #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) - #self.seq.add(get_activation('relu')) - #self.seq.add(nn.Dense(in_units=2, units=2, flatten=False)) - #self.seq.register_op_hook(mon_callback, monitor_all=True) - #self.l1.register_op_hook(mon_callback, monitor_all=True) - - - def forward(self, input1, input2): - output_1 = self.l1(input1) - output_2 = self.l2(input2) - #print(input) - #input = self.act1(input) - return output_1 + output_2 - - def test_model(pass_name): - from mxnet import gluon - from mxnet.gluon import Block, nn, HybridBlock - from mxnet import init - - - - hvd.init() - rank = hvd.rank() - size = hvd.size() - ctx = mx.gpu(rank) - - np.random.seed(1234 + 10 * rank) - mx.random.seed(1234 + 10 * rank) - - num_gpus = size - root_rank = 0 - helper = _NCCLReduceHelper - helper.init(num_gpus, 0) - kwargs = { - 'num_gpus': num_gpus, - 'rank': helper.rank, - 'root_rank': helper.root_rank, - 'nccl_unique_id': helper.nccl_id.ctypes.data} - - options = {"num_gpus":num_gpus, "rank": rank, "nccl_unique_id": helper.nccl_id.ctypes.data} - - model = Easynet() - - if rank == 0: - model.l1.weight.initialize(init=init.One(), ctx=ctx) - model.l1.bias.initialize(init=init.One(), ctx=ctx) - model.l2.weight.initialize(init=init.One(), ctx=ctx) - model.l2.bias.initialize(init=init.One(), ctx=ctx) - else: - model.l1.weight.initialize(init = init.Zero(), ctx = ctx) - model.l1.bias.initialize(init=init.Zero(), ctx=ctx) - model.l2.weight.initialize(init=init.Zero(), ctx=ctx) - model.l2.bias.initialize(init=init.Zero(), ctx=ctx) - model.hybridize() - #param_dict = classify_net.collect_params() - #params = [p for p in param_dict.values() if p.grad_req != 'null'] - params = model.collect_params() - index = 0 - dic = {} - lr_scheduler = PolyScheduler(max_update=1, - base_lr=1e-3, - warmup_begin_lr=0.0, - pwr=1, - final_lr=0.0, - warmup_steps=0, - warmup_mode='linear') - optimizer_params = 
{'learning_rate': 1e-3, - 'wd': 1e-2, - 'lr_scheduler': lr_scheduler} - trainer = POS_Trainer(params, "adam", optimizer_params) - cor_rank = trainer.correspond_ranks() - - for name in params: - type = name.split('.')[-1] - index = cor_rank[params[name]._uuid] - new_name = params[name]._uuid.replace('-', '_') + '_' + type - dic[new_name] = index - - options.update(dic) - print(options) - - #return - ''' - backward_options = {"partition_grad":True, "current_rank":rank} - for k in options: - if k in ['num_gpus', 'rank', 'nccl_unique_id']: - continue - backward_options['ncclreduce_' + k + '_backward'] = options[k] - print(backward_options) - ''' - options = trainer.generate_graph_pass_options() - print(options) - backward_options = trainer.generate_backward_options() - print(backward_options) - x = np.ones((1,2), ctx = ctx) - x2 = np.ones((1, 2), ctx=ctx) - label = np.ones((2, ), ctx = ctx) * rank - #print(options) - - loss_function = gluon.loss.L2Loss() - - model.optimize_for(x, x2, backend = "myPass", **options) - #model.export("my_reduce_" + str(rank)) - for i in range(1): - with mx.autograd.record(): - out = model(x, x2) - loss = loss_function(out, label).mean() / size - print("now call backward in python") - #print(loss.backward) - #print(loss) - #print(loss.backward) - loss.backward(backward_option = backward_options) - mx.npx.waitall() - mx.nd.waitall() - for name in params: - print(name, params[name].list_grad()[0]) - - - #print(out) - - -def test_reduce(pass_name): - from mxnet import gluon - from mxnet.gluon import Block, nn, HybridBlock - from mxnet import init - - hvd.init() - rank = hvd.rank() - size = hvd.size() - ctx = mx.gpu(rank) - - num_gpus = size - root_rank = 0 - helper = _NCCLReduceHelper - helper.init(num_gpus, root_rank) - kwargs = { - 'num_gpus': num_gpus, - 'rank': helper.rank, - 'root_rank': helper.root_rank, - 'nccl_unique_id': helper.nccl_id.ctypes.data} - - options = {"rank": rank, "nccl_unique_id":helper.nccl_id.ctypes.data} - - a = mx.sym.var('a') - sym = mx.sym.contrib.NCCLReduce(a, **kwargs) - - inputs = [a] + args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))} + # execute in MXNet + print('-------------------------------') + print('Testing regular MXNet execution') + inputs = [a,b] sym_block = nn.SymbolBlock(sym, inputs) - sym_block.initialize(ctx = ctx) - p = mx.nd.ones((2, 3), ctx=ctx) * (rank + 2) - sym_block.optimize_for(p, backend = pass_name, **options) - sym_block.export("my_choice") - return - - print(p) - out = sym_block(p) - mx.nd.waitall() - mx.npx.waitall() + sym_block.initialize() + out = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) print(out) - return - class ReduceNet(nn.HybridBlock): - def __init__(self): - super().__init__() - self.reduce_layer = NCCLReduce(num_gpus=2, root_rank=0) - def forward(self, x): - x = self.reduce_layer(x) - return x - - net = ReduceNet() - net.initialize(ctx=ctx) - net.hybridize() - data = mx.nd.ones((3,2), ctx = ctx) - net(data) - #data = mx.np.ones((1, 10), ctx=ctx) * rank - a = mx.sym.var('a') - out = net(a) - inputs = [a] - out_sym = nn.SymbolBlock(out, inputs) - out_sym.initialize(ctx = ctx) - true_out = out_sym(mx.nd.ones((3,2))) - - print(true_out) + # Gluon optimize_for + print('-------------------------------') + print('Testing pass "%s" Gluon Hybridize with shapes/types without inference' % pass_name) + inputs = [a,b] + sym_block2 = nn.SymbolBlock(sym, inputs) + sym_block2.initialize() + sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name) + sym_block2.export('modified') 
-test_model('myPass')
+test_model('myPass')
\ No newline at end of file

From ca2749942f6aba1c5bce24f28f8b46575b58faf0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 21 Jan 2022 09:21:57 +0000
Subject: [PATCH 17/23] add new dir

---
 .../extensions/lib_reduce_gradient/Makefile   |  24 +++
 .../extensions/lib_reduce_gradient/README.md  |  66 ++++++
 .../lib_reduce_gradient/add_reduce_op.cc      | 115 ++++++++++
 .../lib_reduce_gradient/pos_trainer.py        | 201 ++++++++++++++++++
 .../lib_reduce_gradient/test_reduce.py        | 129 +++++++++++
 5 files changed, 535 insertions(+)
 create mode 100644 example/extensions/lib_reduce_gradient/Makefile
 create mode 100644 example/extensions/lib_reduce_gradient/README.md
 create mode 100644 example/extensions/lib_reduce_gradient/add_reduce_op.cc
 create mode 100644 example/extensions/lib_reduce_gradient/pos_trainer.py
 create mode 100644 example/extensions/lib_reduce_gradient/test_reduce.py

diff --git a/example/extensions/lib_reduce_gradient/Makefile b/example/extensions/lib_reduce_gradient/Makefile
new file mode 100644
index 000000000000..95b2fdc4d9bc
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/Makefile
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+all: pass_lib
+
+pass_lib:
+	g++ -shared -fPIC -std=c++11 add_reduce_op.cc ../../../src/lib_api.cc -o add_reduce_op_lib.so -I ../../../include
+
+clean:
+	rm -rf add_reduce_op_lib.so

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
new file mode 100644
index 000000000000..85cfb3d7b16f
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -0,0 +1,66 @@
+
+
+Add Reduce Operations to the Computation Graph
+=======================================
+
+## Introduction
+This is part of the work of porting [DeepSpeed's ZeRO](https://arxiv.org/abs/1910.02054) into MXNet.
+Because of the difference between the symbolic and imperative modes, we divide the whole process into two phases:
+
+Phase 1: Add reduce operations to the graph. A reduce operation does nothing in
+the forward pass, but reduces the gradient to the GPU that owns it (according to the POS trainer).
+
+Phase 2: In the backward graph, drop the reduced outputs from the arrays so the memory planner can reuse their memory.
+
+## Getting started
+### Prepare NCCL and Horovod
+We use Horovod for communication and NCCL for the reduce itself, so please install both first.
+
+### Compile and load the graph pass
+Compile it as in [lib_pass](../lib_pass/): running `make` produces the dynamic library
+**add_reduce_op_lib.so** from the `add_reduce_op.cc` file.
+Then load it in your Python code:
+```python
+import mxnet as mx
+mx.library.load('add_reduce_op_lib.so')
+```
+
+### Prepare options
+Next we need to know how the parameters and their gradients are partitioned across the GPUs.
+Use **POS_Trainer** from `pos_trainer.py` in place of the normal MXNet trainer:
+```python
+from pos_trainer import POS_Trainer
+trainer = POS_Trainer(params_dict, "adam", optimizer_params)
+```
+The trainer can then generate the corresponding options:
+```python
+options = trainer.generate_graph_pass_options()
+backward_options = trainer.generate_backward_options()]
+```
+### Modify the graph
+Before the forward pass, we call
+```python
+model.optimize_for(x, backend = "add_reduce_op", **options)
+```
+to insert the reduce operations into the graph.
+Then we call backward, passing the backward options:
+```python
+loss.backward(backward_option = backward_options)
+```
\ No newline at end of file
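To make the option format concrete, here is a minimal hypothetical sketch of what the two dictionaries contain at runtime. The real keys are derived from parameter uuids, so the `dense*` names below are placeholders, not the library's actual output:

```python
# Hypothetical contents for a 2-GPU run of a model with two Dense layers.
# Real keys are uuid-derived strings, not human-readable names like these.
options = {
    "num_gpus": 2,                    # world size
    "rank": 0,                        # rank of this process
    "nccl_unique_id": 139637123072,   # address of the broadcast NCCL id buffer (made-up value)
    "dense0_weight": 0,               # parameter node name -> rank that owns its gradient
    "dense0_bias": 0,
    "dense1_weight": 1,
    "dense1_bias": 1,
}
backward_options = {
    "partition_grad": True,           # enable dropping of non-owned gradients
    "current_rank": 0,
    # backward node name of each inserted reduce -> owning rank
    "ncclreduce_dense0_weight_backward": 0,
    "ncclreduce_dense0_bias_backward": 0,
    "ncclreduce_dense1_weight_backward": 1,
    "ncclreduce_dense1_bias_backward": 1,
}
```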
diff --git a/example/extensions/lib_reduce_gradient/add_reduce_op.cc b/example/extensions/lib_reduce_gradient/add_reduce_op.cc
new file mode 100644
index 000000000000..8faad796029a
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/add_reduce_op.cc
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file add_reduce_op.cc
+ * \brief graph pass that inserts an NCCLReduce node after every parameter that has a gradient
+ */
+
+#include <math.h>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include "mxnet/lib_api.h"
+
+using namespace mxnet::ext;
+
+
+
+MXReturnValue add_reduce_op(mxnet::ext::Graph* g,
+                            const std::unordered_map<std::string, std::string>& options) {
+  std::string cur_rank = "";
+
+  std::string num_gpus = "";
+  std::string nccl_unique_id = "";
+
+  for (auto kv : options) {
+    std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;
+    if (kv.first == "rank")
+    {
+      cur_rank = kv.second.c_str();
+    }
+    if (kv.first == "nccl_unique_id")
+      nccl_unique_id = kv.second.c_str();
+    if (kv.first == "num_gpus")
+      num_gpus = kv.second.c_str();
+  }
+  size_t length = g->size();
+  std::string root_rank;
+  mxnet::ext::Node *target_node;
+  for (size_t i = 0; i < length; i += 1)
+  {
+    target_node = g->getNode(i);
+    //std::cout << "deal with:" << target_node->name << std::endl;
+    auto it = options.find(target_node->name);
+    if (it == options.end()) {continue;}  // req_grad == null
+    root_rank = it->second;
+    mxnet::ext::Node *new_reduce = g->addNode("ncclreduce_" + target_node->name, "_contrib_NCCLReduce");
+    auto new_attrs = &new_reduce->attrs;
+    auto old_attrs = target_node->attrs;
+    for (auto it = old_attrs.begin(); it != old_attrs.end(); it++)
+    {
+      if (it->first == "__ext_dtype__" || it->first == "__ext_shape__" || it->first == "__profiler_scope__")
+      {
+        new_attrs->insert({{it->first, it->second}});
+      }
+    }
+    new_attrs->insert({{"nccl_unique_id", nccl_unique_id}});
+    new_attrs->insert({{"num_gpus", num_gpus}});
+    new_attrs->insert({{"rank", cur_rank}});
+    new_attrs->insert({{"root_rank", root_rank}});
+
+    for (size_t j = 0; j < target_node->outputs.size(); j++)
+    {
+      new_reduce->outputs.push_back(target_node->outputs[j]);
+      mxnet::ext::Node *output_node = target_node->outputs[j].node;
+      int entry = target_node->outputs[j].entry;
+      //std::cout << "try change:" << output_node->name << ":" << output_node->inputs.size() << std::endl;
+      output_node->inputs[entry].node = new_reduce;
+    }
+    // clear all old output edges (popping inside a loop bounded by the shrinking size would only remove half)
+    target_node->outputs.clear();
+    target_node->outputs.push_back({new_reduce, 0});
+    new_reduce->inputs.push_back({target_node, 0});
+  }
+  g->print();
+
+  return MX_SUCCESS;
+}
+
+
+
+REGISTER_PASS(add_reduce_op).setBody(add_reduce_op);
+
+MXReturnValue initialize(int version) {
+  if (version >= 10700) {
+    std::cout << "MXNet version " << version << " supported" << std::endl;
+    return MX_SUCCESS;
+  } else {
+    MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl;
+    return MX_FAIL;
+  }
+}

diff --git a/example/extensions/lib_reduce_gradient/pos_trainer.py b/example/extensions/lib_reduce_gradient/pos_trainer.py
new file mode 100644
index 000000000000..357289a8ab07
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/pos_trainer.py
@@ -0,0 +1,201 @@
+# POS_Trainer implements stage one of DeepSpeed's ZeRO: partitioning the optimizer states.
+# It reduces memory consumption compared to the plain distributed Trainer, but is slower,
+# since we cannot yet overlap the parameter broadcast with the optimizer step.
+# The usage of this trainer is exactly the same as the original one.
+# Some benchmarks: on 4 V100 GPUs with 16 GB memory each, the maximum batch sizes are
+# bert-large: Original: 16  POS: 24
+# bert-base:  Original: 64  POS: 80
+# The ideal average memory saving per GPU is (N-1)/N * P * K,
+# where N is the number of GPUs, P is the parameter count and K is the memory
+# multiplier of the optimizer states (e.g. 
for Adam, K = 12) +#TODO add group_num +from horovod.mxnet.mpi_ops import allreduce, allreduce_ +from horovod.mxnet.mpi_ops import broadcast, broadcast_ +from horovod.mxnet.mpi_ops import init, shutdown +from horovod.mxnet.mpi_ops import size, local_size, rank, local_rank +from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle + +import mxnet as mx +from collections import OrderedDict, defaultdict +import types +import time +import warnings +from mxnet.gluon.parameter import Parameter +from horovod.mxnet.mpi_ops import ProcessSet, global_process_set, add_process_set, remove_process_set + +class _NCCLReduceHelper(object): + _init = False + nccl_id = None + num_gpus = None + rank = None + + @staticmethod + def init(num_gpus, root_rank): + """Communicate the NCCL unique id""" + cls = _NCCLReduceHelper + if not cls._init: + cls._init = True + import ctypes + try: + from mpi4py import MPI + except: + raise ImportError("Spatial parallel modules require mpi4py package.") + import numpy as np + nccl_id_size = ctypes.c_int() + check_call(_LIB.MXNCCLGetUniqueIdSize(ctypes.byref(nccl_id_size))) + nccl_id_size = nccl_id_size.value + cls.nccl_id = np.zeros(nccl_id_size, np.byte) + check_call(_LIB.MXNCCLGetUniqueId( + cls.nccl_id.ctypes.data_as(ctypes.c_void_p))) + global_comm = MPI.COMM_WORLD + rank = global_comm.rank + color = rank / num_gpus + comm = global_comm.Split(color, rank) + comm.Bcast([cls.nccl_id, nccl_id_size, MPI.BYTE], root=0) + cls.num_gpus = num_gpus + cls.rank = rank % num_gpus + cls.root_rank = root_rank % num_gpus + assert num_gpus == cls.num_gpus + + +class POS_Trainer(mx.gluon.Trainer): + def __init__(self, params, optimizer, optimizer_params=None, + gradient_predivide_factor=1.0, prefix=None, partition_gradients = False): + + self._world_size = size() + self._world_rank = rank() + + self._partition_gradients = partition_gradients + + self._all_params = [] + self._all_param2idx = {} + self._all_params_with_names = params + param_list = [] + if isinstance(params, (dict, OrderedDict)): + for key in sorted(list(params.keys())): + param_list.append(params[key]) + params = param_list + if not isinstance(params, (list, tuple)): + raise ValueError( + "First argument must be a list or dict of Parameters, " \ + "got %s." % (type(params))) + for i, param in enumerate(params): + if not isinstance(param, Parameter): + raise ValueError( + "First argument must be a list or dict of Parameters, " \ + "got list of %s." % (type(param))) + if param._uuid in self._all_param2idx: + # Shared parameters have same uuid; only need to store one of the shared versions + continue + self._all_param2idx[param._uuid] = i + self._all_params.append(param) + self._partition_params, self._param2rank = self._partition_parameters(self._all_params) + self._own_part = self._partition_params[self._world_rank] + super(POS_Trainer, self).__init__( + self._own_part, optimizer, optimizer_params=optimizer_params, kvstore=None) + self._prefix = prefix if prefix else "" + self._scale = gradient_predivide_factor / size() + self._gradient_predivide_factor = gradient_predivide_factor + + + + def _partition_parameters(self, params): + """ + partition all the parameters by their size and try to average them. 
+ """ + world_size = self._world_size + ## list for rank each would be + partition_params = [[] for _ in range(world_size)] + param2rank = {} + sizes = [0 for _ in range(world_size)] + for param in params: + if param.grad_req != 'null': + current_rank = sizes.index(min(sizes)) + partition_params[current_rank].append(param) + num = 1 + param2rank[param._uuid] = current_rank + for p in param.shape: + num *= p + sizes[current_rank] += num + return partition_params, param2rank + + def _allreduce_grads(self): + """ + rewrite allreduce here because we need to communicate using horovod. + Actually we should use reduce here, but since it is not available yet, + I use allreduce instead. + """ + if not self._partition_gradients: + for i, param in enumerate(self._all_params): + if param.grad_req != 'null': + allreduce_(param.list_grad()[0], average=False, + name=self._prefix + str(i), priority=-i, + prescale_factor=1.0 / self._gradient_predivide_factor) + + + + + + def step(self, batch_size, ignore_stale_grad=False): + """ + inherit from trainer, only call boardcast to make sure all parameter are consistent + Makes one step of parameter update. + Since each process main their own part, we need to brodcast after calculation + """ + super(POS_Trainer, self).step(batch_size, ignore_stale_grad) + self._broadcast_partition_params() + + if not self._kv_initialized: + self._init_kvstore() + if self._params_to_init: + self._init_params() + + def update(self, batch_size, ignore_stale_grad=False): + ''' + assert not (self._kvstore and self._update_on_kvstore), \ + 'update() when parameters are updated on kvstore ' \ + 'is not supported. Try setting `update_on_kvstore` ' \ + 'to False when creating trainer.' + Since each process main their own part, we need to brodcast after calculation + ''' + + + super(POS_Trainer, self).update(batch_size, ignore_stale_grad) + self._broadcast_partition_params() + + def _broadcast_partition_params(self): + """ + This function is to broadcast parameter since each process will maintain their own part + """ + for param in self._all_params: + broadcast_(param.data(), self._param2rank[param._uuid], name=str(self._all_param2idx[param._uuid])) + + def correspond_ranks(self): + return self._param2rank + + def generate_graph_pass_options(self): + #helper = _NCCLReduceHelper + #helper.init(size(), 0) + #helper2 = _NCCLReduceHelper + #helper2.init(size(), 0) + options = {} + for name in self._all_params_with_names: + type = name.split('.')[-1] + index = self._param2rank[self._all_params_with_names[name]._uuid] + new_name = self._all_params_with_names[name]._uuid.replace('-', '_') + '_' + type + options[new_name] = index + + helper = _NCCLReduceHelper + helper.init(size(), 0) + options.update({"num_gpus": size(), "rank": rank(), "nccl_unique_id":helper.nccl_id.ctypes.data}) + return options + + def generate_backward_options(self): + backward_option = {"partition_grad":True, "current_rank":rank()} + for name in self._all_params_with_names: + type = name.split('.')[-1] + index = self._param2rank[self._all_params_with_names[name]._uuid] + new_name = 'ncclreduce_' + self._all_params_with_names[name]._uuid.replace('-', '_') + '_' + type + "_backward" + backward_option[new_name] = index + return backward_option diff --git a/example/extensions/lib_reduce_gradient/test_reduce.py b/example/extensions/lib_reduce_gradient/test_reduce.py new file mode 100644 index 000000000000..b8e40580f0f0 --- /dev/null +++ b/example/extensions/lib_reduce_gradient/test_reduce.py @@ -0,0 +1,129 @@ +#!/usr/bin/env 
diff --git a/example/extensions/lib_reduce_gradient/test_reduce.py b/example/extensions/lib_reduce_gradient/test_reduce.py
new file mode 100644
index 000000000000..b8e40580f0f0
--- /dev/null
+++ b/example/extensions/lib_reduce_gradient/test_reduce.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=arguments-differ
+
+# This test checks that dynamic loading of the library into MXNet succeeds
+# and checks the end-to-end computation of the custom operator
+
+import os, ctypes
+import mxnet as mx
+import time
+from mxnet.gluon import nn
+from mxnet import nd
+import numpy as np
+from mxnet.lr_scheduler import PolyScheduler
+from mxnet import np, npx
+from pos_trainer import POS_Trainer
+try:
+    import horovod.mxnet as hvd
+except ImportError:
+    pass
+from mxnet.base import _LIB, check_call, mx_uint, c_str, c_str_array, SymbolHandle
+
+# load library
+if (os.name=='posix'):
+    path = os.path.abspath('add_reduce_op_lib.so')
+    mx.library.load(path)
+elif (os.name=='nt'):
+    path = os.path.abspath('add_reduce_op_lib.dll')
+    mx.library.load(path)
+
+
+class Easynet(nn.HybridBlock):
+    def __init__(self, n):
+        super().__init__()
+        self.ls = nn.HybridSequential()
+        for i in range(n):
+            self.ls.add(nn.Dense(in_units=2, units=2, flatten=False))
+
+
+
+    def forward(self, input):
+        input = self.ls(input)
+        return input
+
+
+def test_model():
+    from mxnet import gluon
+    from mxnet.gluon import Block, nn, HybridBlock
+    from mxnet import init
+
+
+
+    hvd.init()
+    rank = hvd.rank()
+    size = hvd.size()
+    ctx = mx.gpu(rank)
+
+    np.random.seed(1234 + 10 * rank)
+    mx.random.seed(1234 + 10 * rank)
+
+
+    number = 2
+    model = Easynet(number)
+
+    if rank == 0:
+        for i in range(number):
+            model.ls[i].weight.initialize(init=init.One(), ctx=ctx)
+            model.ls[i].bias.initialize(init=init.One(), ctx=ctx)
+    else:
+        for i in range(number):
+            model.ls[i].weight.initialize(init=init.Zero(), ctx=ctx)
+            model.ls[i].bias.initialize(init=init.Zero(), ctx=ctx)
+
+    model.hybridize()
+
+    params = model.collect_params()
+    lr_scheduler = PolyScheduler(max_update=1,
+                                 base_lr=1e-3,
+                                 warmup_begin_lr=0.0,
+                                 pwr=1,
+                                 final_lr=0.0,
+                                 warmup_steps=0,
+                                 warmup_mode='linear')
+    optimizer_params = {'learning_rate': 1e-3,
+                        'wd': 1e-2,
+                        'lr_scheduler': lr_scheduler}
+    trainer = POS_Trainer(params, "adam", optimizer_params)
+
+    options = trainer.generate_graph_pass_options()
+    backward_options = trainer.generate_backward_options()
+    x = np.ones((1,2), ctx = ctx)
+    label = np.ones((2, ), ctx = ctx) * rank
+    #print(options)
+
+    loss_function = gluon.loss.L2Loss()
+
+    model.optimize_for(x, backend = "add_reduce_op", **options)
+    for i in range(1):
+        with mx.autograd.record():
+            out = model(x)
+            loss = loss_function(out, label).mean() / size
+        loss.backward(backward_option = backward_options)
+        mx.npx.waitall()
+        mx.nd.waitall()
+
+    for name in params:
+        print(name, params[name].list_grad()[0])
+    print('finish')
+
+
+test_model()

From 1153b2289ed2d5baaf33fcca67e2cf326f5996ed Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 
21 Jan 2022 09:23:02 +0000
Subject: [PATCH 18/23] add example

---
 example/extensions/lib_reduce_gradient/README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index 85cfb3d7b16f..43d524e01ff3 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -52,7 +52,7 @@ trainer = POS_Trainer(params_dict, "adam", optimizer_params)
 The trainer can then generate the corresponding options:
 ```python
 options = trainer.generate_graph_pass_options()
-backward_options = trainer.generate_backward_options()]
+backward_options = trainer.generate_backward_options()
 ```
 ### Modify the graph
 Before the forward pass, we call
@@ -63,4 +63,12 @@ to insert the reduce operations into the graph.
 Then we call backward, passing the backward options:
 ```python
 loss.backward(backward_option = backward_options)
-```
\ No newline at end of file
+```
+###Simple Example
+Please see `test_reduce.py`
+
+###Current problem
+1. The reduce operation can cause a deadlock (this does not happen with the NaiveEngine). Moreover, it hits invalid-address
+errors in complex models such as BERT-Base.
+2. We do remove the outputs from the backward graph via the backward options, but we still need to verify that this decreases the memory
+consumption.
\ No newline at end of file

From 1a7c627a04eb6c95b27ef067918cdde2018cf673 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Fri, 21 Jan 2022 09:25:06 +0000
Subject: [PATCH 19/23] add annotations

---
 example/extensions/lib_reduce_gradient/pos_trainer.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/example/extensions/lib_reduce_gradient/pos_trainer.py b/example/extensions/lib_reduce_gradient/pos_trainer.py
index 357289a8ab07..eb22e2948973 100644
--- a/example/extensions/lib_reduce_gradient/pos_trainer.py
+++ b/example/extensions/lib_reduce_gradient/pos_trainer.py
@@ -171,14 +171,8 @@ def _broadcast_partition_params(self):
         for param in self._all_params:
             broadcast_(param.data(), self._param2rank[param._uuid], name=str(self._all_param2idx[param._uuid]))
 
-    def correspond_ranks(self):
-        return self._param2rank
-
-    def generate_graph_pass_options(self):
-        #helper = _NCCLReduceHelper
-        #helper.init(size(), 0)
-        #helper2 = _NCCLReduceHelper
-        #helper2.init(size(), 0)
+    def generate_graph_pass_options(self):
+        # Generate graph-pass options: key is the parameter's node name, value is the rank that owns its gradient
         options = {}
         for name in self._all_params_with_names:
             type = name.split('.')[-1]
@@ -192,6 +186,7 @@ def generate_backward_options(self):
+        # Generate backward options: key is the backward node name of each inserted reduce, value is its owning rank
         backward_option = {"partition_grad":True, "current_rank":rank()}
         for name in self._all_params_with_names:
             type = name.split('.')[-1]

From 0bb29346ee66428593413c755e5848b609d29990 Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:35:08 +0800
Subject: [PATCH 20/23] add picture

---
 .../lib_reduce_gradient/addreduce.png         | Bin 0 -> 12062 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 example/extensions/lib_reduce_gradient/addreduce.png

diff --git a/example/extensions/lib_reduce_gradient/addreduce.png b/example/extensions/lib_reduce_gradient/addreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..40338b972d454e585b9b5589f8005bddcd8a0048
GIT binary patch
literal 12062
[12 KB of base85-encoded binary PNG data omitted; this is the addreduce.png illustration referenced in the README]

literal 0
HcmV?d00001
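Since the image itself cannot be reproduced here, the effect of the pass can also be inspected programmatically. A sketch, assuming `model` has been hybridized, run once, and optimized with the `add_reduce_op` backend as in `test_reduce.py`:

```python
# Dump the transformed graph and list the inserted reduce nodes.
import json

model.export("reduced_graph")  # writes reduced_graph-symbol.json (plus a params file)
with open("reduced_graph-symbol.json") as f:
    graph = json.load(f)
reduce_nodes = [n["name"] for n in graph["nodes"] if n["op"] == "_contrib_NCCLReduce"]
print(reduce_nodes)  # expect one ncclreduce_* node per parameter with grad_req != 'null'
```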
z*tSj@;kQN?ZZ(=R9r@5~TP}=BNJ->l*@lXxP2&RZT`!wLLRYkKw4Uw-76#3a-Z*$rbLYpeUiiIU#>~x^Dn^d6Ek1D zTTTxJnZXp=08_A-c=xPf1*3A;GD_eTkqM}H5(Xl za77+Gk3o?sc(-8e$qWr~zhJU=xZx{3#uFf(=%!K$SZAv!;KyH^nEs^Mm~Z5_LDN zL0;7aW`0Ja@U2|@CCi6Z!2{9QC#JC#RBZYF5_dmCwdKf1FCCCFF){txBE62~9$^YX z%*|0!eh5X2)oU2vb6mohN3 z_rCx{n(Fp=xrBnnK&D{2eFByNc@6?U5(6dnn|&3%rmzD&@oJ<-A>?jWm+?s2lVryF zU{=}yp69dp_F7 zJuSEj;i4vpx1&?TW2UyzqHM%T%Bw;P>za{yL~g$nT?RZ$2td_A#pT2e6}Sbr)u8oZ zIYRAPrD4fIGPs@n#s^Q20VPNDUrwsa4dmyBH0xePA_dm&L7Zh~{6l}Jcu8-lYuFhb zK$R%gqKWvO0V}bPN?IU-tk;w2%>2cfBFtk1oMAs~38Z~D@2nS0jvXoZF}%;mY=SG;P96xa<7PV+oG zH!UYxH>=sE%YocN_FQ6l%$B%|lp8)$2-$j*QCp|*w9HKR2#w>ryYb-zoO z>(Zr5V}{h^3}*S^mup`-<)J$WhD_U;Q+ZOvf#YQzguS{?XDj`u?86>hw7Z3qOS+2{ zJ9GyeHm>6Tg3c4SFb31Z0f?pqsUhSI$N^Yi(6&SfT|=jRu|rVi28zI=umu3gAw$94 z><)xFfnspRB_+W83s`_lA2+f6_00n~UZ|Y$zds&6TwDwWkz)38I|$mCesT@b8}%da zJd#14VJ?yj78KPtN+Y|W!oqX+?^G+1rQSO&E@#36bHTEupHDVej!cNqaOK(W0amF2 zyMsYiWBCpT#CDZ=45`4@@gt@$oSA}j=-3T*@sUoaN4hz=QC$3nE}kIdU^0rf5 zD;Oxi{#^rp0aI;dw1)0mH-*P4z6-3|3xY&LSi8ku_NbrtgHJUH{_;6S?mx=LzzU`} zfw1d3AXWelAvjWK_s)WG$b1qiN`7CjtsoXVS@lE+&drl7DW??s{u8|$l4{ok@*;@l}2 z85ar`p8*!ej93Ck?frp1@h>S9Tc>gGdqQ-Y_!X|;LzqK?k- z^$-<6GLv%BkJjwsAM-4^8S|iQ{t#}Y-nLcARndtb7rB#8zpvfN(6YOW-<<`+$*+gN zki$nGI_V7-7d`&t(?Hi@9CJu$G$o0IpDp+a)Td!nQ|+hragURaJ1KFGd)rllA9PRS zQwBP}6&9U1^cNE>NYeR4%+VQ01xJjz`&-_R+<$#!Z7AczB^srgU9nryu2O!5BwIv} zDsa;wlrf5z9HW|~znjl|lbr z@&h)y$dqaaas(Xbi8rMeRpvcPyfzN8SHXg@R{liOLQc&oq5^BldZ93YF4zC93iUR)*8Q!rKT zvxRr6tDT}P^}N=_1W|sdLgI3bl+(XCM1>KY7}afm-PV16`Q!bqpAzMjKPP3nE@5&@ z+s+$CZDyFyOe*CJNCA}YqLie?yJ16$j#-kz^!!dfy^JJ^5A&g>Z%xbf^3?l5dKD;e zb#EI_g4x|I|1PfGEptZpwtQ5@{c(3{(d)ybUGtZH`<8wr_%NMiwk{rh)mzU40AHoS zP^-Yo@!PW(R$|_DzK})mNH;E?33r z2-R+;n0(?`&ez{T6&&wXM5Fwe7%B2dnF;_j`R@XqDEu~sr=0J)l>3yD{A};Jf{;-6 z`xgu9(T;Xtaq;jUxPpVy7F@KcD<{hD!R;7C47DXw@FPdC>*x3(N|B;;6}$KK zto-Z|$Opi60u-llc6>IXOK-)>bZjRVh zirMe96Kp}3V8_6K0sv3&<}ua&7RpxguNHtQu4~!EHmVv^kxlTAQoY2Dt+TXj5~~)X z_niMPVT%+gK8MQ{Lq*|rIdx&Jh5-jB3pPV9ee)PQYCJ;tmU?XVKg|ODf;M8 cj%RbX>erfraV~Ye;CO-aCRdEh&tAXtAM||$egFUf literal 0 HcmV?d00001 From 7152c9c2bd37a962b2c259a05cc707257c3b4a05 Mon Sep 17 00:00:00 2001 From: xinyual <74362153+xinyual@users.noreply.github.com> Date: Fri, 21 Jan 2022 22:36:02 +0800 Subject: [PATCH 21/23] add picture --- example/extensions/lib_reduce_gradient/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md index 43d524e01ff3..e1fc0efcd8b0 100644 --- a/example/extensions/lib_reduce_gradient/README.md +++ b/example/extensions/lib_reduce_gradient/README.md @@ -60,6 +60,7 @@ Before forward, we use model.optimize_for(x, backend = "add_reduce_op", **options) ``` to insert reduce operation into graphs. +![example add reduce](addreduce.png) Then we call backward option as ```python loss.backward(backward_option = backward_options) @@ -71,4 +72,4 @@ Please see `test_reduce.py` 1. The reduce operation will cause deadlock (it won't happen in NaiveEngine). Moreover, it will meet invalid address problem in complex model like Bert-Base. 2. We do remove outputs from backward graph using backward option. But we need to verify whether it decrease the memory -consumption. \ No newline at end of file +consumption. 
From 47508e798cee005741feb962d0846ffae1059b38 Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:36:26 +0800
Subject: [PATCH 22/23] Update README.md

---
 example/extensions/lib_reduce_gradient/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index e1fc0efcd8b0..0619f66f8b19 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -60,7 +60,8 @@ Before the forward pass, we call
 model.optimize_for(x, backend = "add_reduce_op", **options)
 ```
 to insert the reduce operations into the graph.
-![example add reduce](addreduce.png)
+![example add reduce](addreduce.png)
+
 Then we call backward, passing the backward options:
 ```python
 loss.backward(backward_option = backward_options)

From 435d7a05c24a7d10040e63b6da941246d256539f Mon Sep 17 00:00:00 2001
From: xinyual <74362153+xinyual@users.noreply.github.com>
Date: Fri, 21 Jan 2022 22:36:49 +0800
Subject: [PATCH 23/23] Update README.md

---
 example/extensions/lib_reduce_gradient/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/extensions/lib_reduce_gradient/README.md b/example/extensions/lib_reduce_gradient/README.md
index 0619f66f8b19..bcc3617cb8e2 100644
--- a/example/extensions/lib_reduce_gradient/README.md
+++ b/example/extensions/lib_reduce_gradient/README.md
@@ -66,10 +66,10 @@ Then we call backward, passing the backward options:
 ```python
 loss.backward(backward_option = backward_options)
 ```
-###Simple Example
+### Simple Example
 Please see `test_reduce.py`
 
-###Current problem
+### Current problem
 1. The reduce operation can cause a deadlock (this does not happen with the NaiveEngine). Moreover, it hits invalid-address
 errors in complex models such as BERT-Base.
 2. We do remove the outputs from the backward graph via the backward options, but we still need to verify that this decreases the memory
 consumption.
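Finally, for problem 2, memory usage can be compared with and without the backward options. A sketch, assuming `mx.context.gpu_memory_info` is available in this build (it wraps the MXGetGPUMemoryInformation C API touched earlier in this series):

```python
import mxnet as mx

def used_mib(device_id):
    """Return currently used GPU memory in MiB (assumes the context API is present)."""
    mx.nd.waitall()  # flush pending async work so the reading is stable
    free, total = mx.context.gpu_memory_info(device_id)  # both values in bytes
    return (total - free) / 2**20

# Run one training step with loss.backward(backward_option=backward_options) and
# one without, printing used_mib(rank) after each, to verify the expected saving.
```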