Unverified Commit 17b259f4 authored by Xiaoyu Zhang, committed by GitHub

Add more upsample kernel (#5382)


* fix upsample nearest bug

* fix upsample nearest bug (#5347)

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>

* fix upsample bilinear bug

* fix upsample bilinear bug

* recover code

* align with pytorch

* redesign upsample bilinear

* fix align corner bug

* align bilinear with pytorch

* fix conflict

* fix bug

* code reuse

* fix comments

* add upsample_linear_1d_user_op

* add upsample_linear_1d_user_op

* add linear1d interp cpu kernel

* add linear1d interp gpu kernel

* rename kernel name

* add upsample nearest_1d kernel

* add upsample nearest_1d cuda kernel

* add upsample nearest_1d op

* fix compile bug

* add bicubic interp cpu kernel forward

* add bicubic2d interp cpu kernel backward impl

* add bicubic2d gpu kernel impl

* fix bicubic kernel compile error

* add bicubic2d kernel register

* fix compile bug

* nearest3d interp cpu kernel

* add nearest3d gpu kernel

* add nearest3d kernel registry

* add nearest3d cpu kernel

* add nearest3d cpu kernel

* add nearest3d cpu kernel

* fix upsample nearest interp bug

* fix upsample nearest interp bug

* add trilinear3d cpu kernel forward

* add trilinear3d cpu kernel backward

* add trilinear gpu kernel impl

* add trilinear op registry

* init interpolate nn function

* add upsample functors

* add interpolate module

* add interpolate test_case

* restructure upsample_nearest2d and upsample_bilinear2d

* add upsample nearest2d and bilinear2d op registry

* add upsample nearest2d and bilinear2d op registry

* add upsample kernel grad functor

* add upsample_nearest_2d backward

* add upsample_linear_1d backward

* add upsample_nearest_1d backward

* add upsample_bicubic_2d backward

* fix backward impl bug

* fix nearest1d backward bug

* try to fix backward segmentation fault

* add data_format parameter

* fix upsample kernel backward bug

* fix cuda kernel impl bug

* fix upsample nearest bug

* fix align_corner attr error

* align linear1d impl with pytorch

* fix trilinear register bug

* fix trilinear bug

* add trilinear align_corners test

* add test for bicubic, still has bug

* reimpl bicubic2d kernel

* align cubic interpolate with pytorch

* add bicubic cuda kernel

* fix bug

* align api with pytorch

* add doctest for interpolate

* update interpolate docs

* fix comments

* fix check_of_return comment

* fix comment

* fix comments

* fix comment

* fix comment

* fix comments

* fix comments

* fix comments

* fix comments

* fix comment

Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
Co-authored-by: Luyang <flowingsun007@163.com>
parent c527d55e
Showing 4220 additions and 0 deletions
@@ -204,6 +204,7 @@ Experimental features
.. autofunction:: oneflow.experimental.nn.Hardswish
.. autofunction:: oneflow.experimental.nn.PReLU
.. autofunction:: oneflow.experimental.nn.Hardtanh
.. autofunction:: oneflow.experimental.nn.functional.interpolate
.. autofunction:: oneflow.experimental.nn.Upsample
.. autofunction:: oneflow.experimental.nn.UpsamplingNearest2d
.. autofunction:: oneflow.experimental.nn.UpsamplingBilinear2d
......
@@ -17,6 +17,7 @@ limitations under the License.
#include "oneflow/core/framework/op_builder.h"
#include "oneflow/core/framework/op_expr.h"
#include "oneflow/core/framework/op_expr_helper.h"
#include "oneflow/core/functional/functional.h"
#include "oneflow/core/framework/op_interpreter/op_interpreter_util.h"
namespace oneflow {
@@ -93,5 +94,324 @@ Maybe<void> Upsample::Apply(const UpsampleInterpState* ctx, const TensorTuple& o
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample", Upsample);
struct UpsampleNearest2DInterpState : public OpExprInterpState {
bool requires_grad;
float height_scale;
float width_scale;
std::string data_format;
};
class UpsampleNearest2D : public OpExprGradFunction<UpsampleNearest2DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleNearest2DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->height_scale = JUST(composed_attrs.GetAttr<float>("height_scale"));
ctx->width_scale = JUST(composed_attrs.GetAttr<float>("width_scale"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleNearest2DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleNearest2DGrad(out_grads.at(0), x, ctx->height_scale,
ctx->width_scale, ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_nearest_2d", UpsampleNearest2D);
struct UpsampleBilinear2DInterpState : public OpExprInterpState {
bool requires_grad;
float height_scale;
float width_scale;
bool align_corners;
std::string data_format;
};
class UpsampleBilinear2D : public OpExprGradFunction<UpsampleBilinear2DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleBilinear2DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->height_scale = JUST(composed_attrs.GetAttr<float>("height_scale"));
ctx->width_scale = JUST(composed_attrs.GetAttr<float>("width_scale"));
ctx->align_corners = JUST(composed_attrs.GetAttr<bool>("align_corners"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleBilinear2DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleBilinear2DGrad(out_grads.at(0), x, ctx->height_scale,
ctx->width_scale, ctx->align_corners,
ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_bilinear_2d", UpsampleBilinear2D);
struct UpsampleLinear1DInterpState : public OpExprInterpState {
bool requires_grad;
float scale_factor;
bool align_corners;
std::string data_format;
};
class UpsampleLinear1D : public OpExprGradFunction<UpsampleLinear1DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleLinear1DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->scale_factor = JUST(composed_attrs.GetAttr<float>("scale_factor"));
ctx->align_corners = JUST(composed_attrs.GetAttr<bool>("align_corners"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleLinear1DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleLinear1DGrad(out_grads.at(0), x, ctx->scale_factor,
ctx->align_corners, ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_linear_1d", UpsampleLinear1D);
struct UpsampleNearest1DInterpState : public OpExprInterpState {
bool requires_grad;
float scale_factor;
std::string data_format;
};
class UpsampleNearest1D : public OpExprGradFunction<UpsampleNearest1DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleNearest1DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->scale_factor = JUST(composed_attrs.GetAttr<float>("scale_factor"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleNearest1DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(
functional::UpsampleNearest1DGrad(out_grads.at(0), x, ctx->scale_factor, ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_nearest_1d", UpsampleNearest1D);
struct UpsampleBicubic2DInterpState : public OpExprInterpState {
bool requires_grad;
float height_scale;
float width_scale;
bool align_corners;
std::string data_format;
};
class UpsampleBicubic2D : public OpExprGradFunction<UpsampleBicubic2DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleBicubic2DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->height_scale = JUST(composed_attrs.GetAttr<float>("height_scale"));
ctx->width_scale = JUST(composed_attrs.GetAttr<float>("width_scale"));
ctx->align_corners = JUST(composed_attrs.GetAttr<bool>("align_corners"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleBicubic2DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleBicubic2DGrad(out_grads.at(0), x, ctx->height_scale,
ctx->width_scale, ctx->align_corners,
ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_bicubic_2d", UpsampleBicubic2D);
struct UpsampleNearest3DInterpState : public OpExprInterpState {
bool requires_grad;
float depth_scale;
float height_scale;
float width_scale;
std::string data_format;
};
class UpsampleNearest3D : public OpExprGradFunction<UpsampleNearest3DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleNearest3DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->depth_scale = JUST(composed_attrs.GetAttr<float>("depth_scale"));
ctx->height_scale = JUST(composed_attrs.GetAttr<float>("height_scale"));
ctx->width_scale = JUST(composed_attrs.GetAttr<float>("width_scale"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleNearest3DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleNearest3DGrad(out_grads.at(0), x, ctx->depth_scale,
ctx->height_scale, ctx->width_scale,
ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_nearest_3d", UpsampleNearest3D);
struct UpsampleTrilinear3DInterpState : public OpExprInterpState {
bool requires_grad;
float depth_scale;
float height_scale;
float width_scale;
bool align_corners;
std::string data_format;
};
class UpsampleTrilinear3D : public OpExprGradFunction<UpsampleTrilinear3DInterpState> {
public:
Maybe<void> Init(const OpExpr& op) override { return Maybe<void>::Ok(); }
Maybe<void> Capture(UpsampleTrilinear3DInterpState* ctx, const TensorTuple& inputs,
const TensorTuple& outputs, const AttrMap& attrs) const override {
CHECK_EQ_OR_RETURN(inputs.size(), 1);
CHECK_EQ_OR_RETURN(outputs.size(), 1);
ctx->requires_grad = inputs.at(0)->requires_grad();
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
ComposedAttrMap composed_attrs(attrs, base_attrs_);
ctx->depth_scale = JUST(composed_attrs.GetAttr<float>("depth_scale"));
ctx->height_scale = JUST(composed_attrs.GetAttr<float>("height_scale"));
ctx->width_scale = JUST(composed_attrs.GetAttr<float>("width_scale"));
ctx->align_corners = JUST(composed_attrs.GetAttr<bool>("align_corners"));
ctx->data_format = JUST(composed_attrs.GetAttr<std::string>("data_format"));
ctx->SaveTensorForBackward(inputs.at(0));
return Maybe<void>::Ok();
}
Maybe<void> Apply(const UpsampleTrilinear3DInterpState* ctx, const TensorTuple& out_grads,
TensorTuple* in_grads) const override {
if (!ctx->requires_grad) { return Maybe<void>::Ok(); }
CHECK_EQ_OR_RETURN(out_grads.size(), 1);
MutableAttrMap attrs;
const std::shared_ptr<oneflow::one::Tensor>& x = ctx->SavedTensors().at(0);
in_grads->resize(1);
in_grads->at(0) = JUST(functional::UpsampleTrilinear3DGrad(
out_grads.at(0), x, ctx->depth_scale, ctx->height_scale, ctx->width_scale,
ctx->align_corners, ctx->data_format));
return Maybe<void>::Ok();
}
private:
AttrMap base_attrs_;
};
REGISTER_OP_EXPR_GRAD_FUNCTION("upsample_trilinear_3d", UpsampleTrilinear3D);
} // namespace one
} // namespace oneflow
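The backward classes above just replay the saved input tensor and the captured attributes into the corresponding functional::Upsample*Grad calls. A minimal sketch of how this path is exercised from Python, mirroring the tests added in this commit (eager mode and the experimental namespace assumed):

import numpy as np
import oneflow.experimental as flow

flow.enable_eager_execution()

# Forward dispatches upsample_nearest_2d; backward runs UpsampleNearest2D::Apply above.
x = flow.Tensor(
    np.arange(1, 5).reshape((1, 1, 2, 2)), dtype=flow.float32, requires_grad=True
)
y = flow.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
y = y.sum()
y.backward()
print(x.grad.numpy())  # all 4.0: each input pixel contributes to 4 output pixels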
@@ -476,6 +476,90 @@
String interpolation, String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_linear_1d"
signature:
"Tensor UpsampleLinear1D(Tensor x, *, Float scale_factor, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_linear_1d_grad"
signature:
"Tensor UpsampleLinear1DGrad(Tensor dy, Tensor x, *, Float scale_factor, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_nearest_1d"
signature:
"Tensor UpsampleNearest1D(Tensor x, *, Float scale_factor,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_nearest_1d_grad"
signature:
"Tensor UpsampleNearest1DGrad(Tensor dy, Tensor x, *, Float scale_factor,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_nearest_2d"
signature:
"Tensor UpsampleNearest2D(Tensor x, *, Float height_scale, Float width_scale,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_nearest_2d_grad"
signature:
"Tensor UpsampleNearest2DGrad(Tensor dy, Tensor x, *, Float height_scale, Float width_scale,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_bilinear_2d"
signature:
"Tensor UpsampleBilinear2D(Tensor x, *, Float height_scale, Float width_scale, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_bilinear_2d_grad"
signature:
"Tensor UpsampleBilinear2DGrad(Tensor dy, Tensor x, *, Float height_scale, Float width_scale, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_bicubic_2d"
signature:
"Tensor UpsampleBicubic2D(Tensor x, *, Float height_scale, Float width_scale, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_bicubic_2d_grad"
signature:
"Tensor UpsampleBicubic2DGrad(Tensor dy, Tensor x, *, Float height_scale, Float width_scale, Bool align_corners,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_nearest_3d"
signature:
"Tensor UpsampleNearest3D(Tensor x, *, Float depth_scale, Float height_scale, Float width_scale,
String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_nearest_3d_grad"
signature:
"Tensor UpsampleNearest3DGrad(Tensor dy, Tensor x, *, Float depth_scale, Float height_scale, Float width_scale,
String data_format=\"channels_first\")"
bind_python: False
- name: "upsample_trilinear_3d"
signature:
"Tensor UpsampleTrilinear3D(Tensor x, *, Float depth_scale, Float height_scale, Float width_scale,
Bool align_corners, String data_format=\"channels_first\")"
bind_python: True
- name: "upsample_trilinear_3d_grad"
signature:
"Tensor UpsampleTrilinear3DGrad(Tensor dy, Tensor x, *, Float depth_scale, Float height_scale, Float width_scale,
Bool align_corners, String data_format=\"channels_first\")"
bind_python: False
- name: "abs"
signature: "Tensor Abs(Tensor x)"
bind_python: True
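Each signature above is exported to Python as flow.F.<op_name> with the declared keyword-only attributes; the new Interpolate module later in this commit calls these wrappers directly. A small sketch of the call pattern for the bilinear entry (keyword names copied from the signature, scale values chosen for illustration):

import oneflow as flow

def bilinear_x2(x):
    # x: an NCHW tensor; mirrors the flow.F.upsample_bilinear_2d call in Interpolate.forward
    return flow.F.upsample_bilinear_2d(
        x,
        height_scale=2.0,
        width_scale=2.0,
        align_corners=False,
        data_format="channels_first",
    )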
......
@@ -423,6 +423,288 @@ class UpsampleFunctor {
std::shared_ptr<OpExpr> op_;
};
class UpsampleLinear1DFunctor {
public:
UpsampleLinear1DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_linear_1d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& scale_factor,
const bool& align_corners, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("scale_factor", scale_factor));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleLinear1DGradFunctor {
public:
UpsampleLinear1DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_linear_1d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& scale_factor,
const bool& align_corners, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("scale_factor", scale_factor));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest1DFunctor {
public:
UpsampleNearest1DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_nearest_1d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& scale_factor,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("scale_factor", scale_factor));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest1DGradFunctor {
public:
UpsampleNearest1DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_nearest_1d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& scale_factor,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("scale_factor", scale_factor));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest2DFunctor {
public:
UpsampleNearest2DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_nearest_2d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest2DGradFunctor {
public:
UpsampleNearest2DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_nearest_2d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleBilinear2DFunctor {
public:
UpsampleBilinear2DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_bilinear_2d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const bool& align_corners,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleBilinear2DGradFunctor {
public:
UpsampleBilinear2DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_bilinear_2d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const bool& align_corners,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleBicubic2DFunctor {
public:
UpsampleBicubic2DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_bicubic_2d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const bool& align_corners,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleBicubic2DGradFunctor {
public:
UpsampleBicubic2DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_bicubic_2d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& height_scale,
const float& width_scale, const bool& align_corners,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest3DFunctor {
public:
UpsampleNearest3DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_nearest_3d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& depth_scale,
const float& height_scale, const float& width_scale,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("depth_scale", depth_scale));
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleNearest3DGradFunctor {
public:
UpsampleNearest3DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_nearest_3d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& depth_scale,
const float& height_scale, const float& width_scale,
const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("depth_scale", depth_scale));
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleTrilinear3DFunctor {
public:
UpsampleTrilinear3DFunctor() {
op_ = CHECK_JUST(one::OpBuilder("upsample_trilinear_3d").Input("x").Output("y").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& x, const float& depth_scale,
const float& height_scale, const float& width_scale,
const bool& align_corners, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("depth_scale", depth_scale));
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UpsampleTrilinear3DGradFunctor {
public:
UpsampleTrilinear3DGradFunctor() {
op_ = CHECK_JUST(
one::OpBuilder("upsample_trilinear_3d_grad").Input("dy").Input("x").Output("dx").Build());
}
Maybe<Tensor> operator()(const std::shared_ptr<one::Tensor>& dy,
const std::shared_ptr<one::Tensor>& x, const float& depth_scale,
const float& height_scale, const float& width_scale,
const bool& align_corners, const std::string& data_format) const {
MutableAttrMap attrs;
JUST(attrs.SetAttr<float>("depth_scale", depth_scale));
JUST(attrs.SetAttr<float>("height_scale", height_scale));
JUST(attrs.SetAttr<float>("width_scale", width_scale));
JUST(attrs.SetAttr<bool>("align_corners", align_corners));
JUST(attrs.SetAttr<std::string>("data_format", data_format));
return OpInterpUtil::Dispatch<Tensor>(*op_, {dy, x}, attrs);
}
private:
std::shared_ptr<OpExpr> op_;
};
class UnsortedSegmentSumLikeFunctor {
public:
UnsortedSegmentSumLikeFunctor() {
@@ -572,6 +854,20 @@ ONEFLOW_FUNCTION_LIBRARY(m) {
m.add_functor<impl::SqueezeFunctor>("Squeeze");
m.add_functor<impl::CopyFunctor>("Copy");
m.add_functor<impl::UpsampleFunctor>("Upsample");
m.add_functor<impl::UpsampleNearest2DFunctor>("UpsampleNearest2D");
m.add_functor<impl::UpsampleNearest2DGradFunctor>("UpsampleNearest2DGrad");
m.add_functor<impl::UpsampleBilinear2DFunctor>("UpsampleBilinear2D");
m.add_functor<impl::UpsampleBilinear2DGradFunctor>("UpsampleBilinear2DGrad");
m.add_functor<impl::UpsampleLinear1DFunctor>("UpsampleLinear1D");
m.add_functor<impl::UpsampleLinear1DGradFunctor>("UpsampleLinear1DGrad");
m.add_functor<impl::UpsampleNearest1DFunctor>("UpsampleNearest1D");
m.add_functor<impl::UpsampleNearest1DGradFunctor>("UpsampleNearest1DGrad");
m.add_functor<impl::UpsampleBicubic2DFunctor>("UpsampleBicubic2D");
m.add_functor<impl::UpsampleBicubic2DGradFunctor>("UpsampleBicubic2DGrad");
m.add_functor<impl::UpsampleNearest3DFunctor>("UpsampleNearest3D");
m.add_functor<impl::UpsampleNearest3DGradFunctor>("UpsampleNearest3DGrad");
m.add_functor<impl::UpsampleTrilinear3DFunctor>("UpsampleTrilinear3D");
m.add_functor<impl::UpsampleTrilinear3DGradFunctor>("UpsampleTrilinear3DGrad");
m.add_functor<impl::UnsortedSegmentSumLikeFunctor>("UnsortedSegmentSumLike");
m.add_functor<impl::TriuFunctor>("Triu");
m.add_functor<impl::DiagFunctor>("Diag");
......
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import warnings
import oneflow as flow
from oneflow.python.nn.module import Module
from oneflow.python.oneflow_export import oneflow_export, experimental_api
from oneflow.python.framework.tensor import register_tensor_op
from typing import Optional, Union, Tuple
class Interpolate(Module):
def __init__(
self,
size: Optional[Union[int, Tuple[int, ...]]] = None,
scale_factor: Optional[Union[float, Tuple[float, ...]]] = None,
mode: str = "nearest",
align_corners: Optional[bool] = None,
recompute_scale_factor: Optional[bool] = None,
):
super().__init__()
self.size = size
if isinstance(scale_factor, tuple):
self.scale_factor = tuple(float(factor) for factor in scale_factor)
else:
self.scale_factor = float(scale_factor) if scale_factor else None
if mode in ("nearest", "area") and align_corners is not None:
raise ValueError(
"align_corners option can only be set with the "
"interpolating modes: linear | bilinear | bicubic | trilinear"
)
self.mode = mode
self.recompute_scale_factor = recompute_scale_factor
        if align_corners is None:
align_corners = False
self.align_corners = align_corners
self.height_scale = None
self.width_scale = None
if isinstance(self.scale_factor, float):
self.height_scale = self.scale_factor
self.width_scale = self.scale_factor
elif isinstance(self.scale_factor, tuple):
self.height_scale = self.scale_factor[0]
self.width_scale = self.scale_factor[1]
else:
pass
if self.mode not in (
"nearest",
"bilinear",
"linear",
"area",
"bicubic",
"trilinear",
):
raise ValueError(
'interpolation must be "nearest" or "bilinear" or "linear" or "area" or "bicubic" or "trilinear".'
)
if self.mode == "nearest" and self.align_corners:
raise ValueError('interpolation "nearest" does not support align_corners.')
def forward(self, x):
dim = len(x.shape) - 2
if self.size is not None and self.scale_factor is not None:
raise ValueError("only one of size or scale_factor should be defined")
elif self.size is not None:
assert self.scale_factor is None
scale_factors = None
if isinstance(self.size, (list, tuple)):
if len(self.size) != dim:
raise ValueError(
"size shape must match input shape. "
"Input is {}D, size is {}".format(dim, len(self.size))
)
output_size = self.size
else:
output_size = [self.size for _ in range(dim)]
elif self.scale_factor is not None:
assert self.size is None
output_size = None
if isinstance(self.scale_factor, (list, tuple)):
if len(self.scale_factor) != dim:
raise ValueError(
"scale_factor shape must match input shape. "
"Input is {}D, scale_factor is {}".format(
dim, len(self.scale_factor)
)
)
scale_factors = self.scale_factor
else:
scale_factors = [self.scale_factor for _ in range(dim)]
else:
raise ValueError("either size or scale_factor should be defined")
if self.recompute_scale_factor is None:
if scale_factors is not None:
for scale in scale_factors:
if math.floor(scale) != scale:
warnings.warn(
"The default behavior for interpolate/upsample with float scale_factor changed "
"in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, "
"instead of relying on the computed output size. "
"If you wish to restore the old behavior, please set recompute_scale_factor=True. "
"See the documentation of nn.Upsample for details. "
)
break
elif self.recompute_scale_factor and self.size is not None:
raise ValueError(
"recompute_scale_factor is not meaningful with an explicit size."
)
# "area" mode always requires an explicit size rather than scale factor.
# Re-use the recompute_scale_factor code path.
if self.mode == "area" and output_size is None:
self.recompute_scale_factor = True
if self.recompute_scale_factor is not None and self.recompute_scale_factor:
assert scale_factors is not None
output_size = [
                int(math.floor(float(x.shape[i + 2]) * scale_factors[i]))
for i in range(dim)
]
scale_factors = None
if len(x.shape) == 3 and self.mode == "nearest":
return flow.F.upsample_nearest_1d(
x, scale_factor=scale_factors[0], data_format="channels_first"
)
if len(x.shape) == 4 and self.mode == "nearest":
return flow.F.upsample_nearest_2d(
x,
height_scale=scale_factors[0],
width_scale=scale_factors[1],
data_format="channels_first",
)
if len(x.shape) == 5 and self.mode == "nearest":
return flow.F.upsample_nearest_3d(
x,
depth_scale=scale_factors[0],
height_scale=scale_factors[1],
width_scale=scale_factors[2],
data_format="channels_first",
)
# TODO(bbuf) Add adaptive_avg_pool op
if self.mode == "area":
            raise NotImplementedError("adaptive_avg_pool1d is not implemented yet!")
if len(x.shape) == 3 and self.mode == "linear":
assert self.align_corners is not None
return flow.F.upsample_linear_1d(
x,
scale_factor=scale_factors[0],
align_corners=self.align_corners,
data_format="channels_first",
)
if len(x.shape) == 4 and self.mode == "bilinear":
assert self.align_corners is not None
return flow.F.upsample_bilinear_2d(
x,
height_scale=scale_factors[0],
width_scale=scale_factors[1],
align_corners=self.align_corners,
data_format="channels_first",
)
if len(x.shape) == 4 and self.mode == "bicubic":
assert self.align_corners is not None
return flow.F.upsample_bicubic_2d(
x,
height_scale=scale_factors[0],
width_scale=scale_factors[1],
align_corners=self.align_corners,
data_format="channels_first",
)
if len(x.shape) == 5 and self.mode == "trilinear":
assert self.align_corners is not None
return flow.F.upsample_trilinear_3d(
x,
depth_scale=scale_factors[0],
height_scale=scale_factors[1],
width_scale=scale_factors[2],
align_corners=self.align_corners,
data_format="channels_first",
)
@oneflow_export("nn.functional.interpolate")
@experimental_api
def interpolate(
input,
size=None,
scale_factor=None,
mode="nearest",
align_corners=None,
recompute_scale_factor=None,
):
r"""The interface is consistent with PyTorch.
The documentation is referenced from: https://pytorch.org/docs/1.9.0/_modules/torch/nn/functional.html#interpolate
Down/up samples the input to either the given :attr:`size` or the given
:attr:`scale_factor`
The algorithm used for interpolation is determined by :attr:`mode`.
Currently temporal, spatial and volumetric sampling are supported, i.e.
expected inputs are 3-D, 4-D or 5-D in shape.
The input dimensions are interpreted in the form:
`mini-batch x channels x [optional depth] x [optional height] x width`.
The modes available for resizing are: `nearest`, `linear` (3D-only),
`bilinear`, `bicubic` (4D-only), `trilinear` (5D-only), `area`
Args:
input (Tensor): the input tensor
size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
output spatial size.
scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple.
mode (str): algorithm used for upsampling:
``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
``'trilinear'`` | ``'area'``. Default: ``'nearest'``
align_corners (bool, optional): Geometrically, we consider the pixels of the
input and output as squares rather than points.
If set to ``True``, the input and output tensors are aligned by the
center points of their corner pixels, preserving the values at the corner pixels.
If set to ``False``, the input and output tensors are aligned by the corner
points of their corner pixels, and the interpolation uses edge value padding
for out-of-boundary values, making this operation *independent* of input size
when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
is ``'linear'``, ``'bilinear'``, ``'bicubic'`` or ``'trilinear'``.
Default: ``False``
recompute_scale_factor (bool, optional): recompute the scale_factor for use in the
interpolation calculation. When `scale_factor` is passed as a parameter, it is used
to compute the `output_size`. If `recompute_scale_factor` is ``False`` or not specified,
the passed-in `scale_factor` will be used in the interpolation computation.
Otherwise, a new `scale_factor` will be computed based on the output and input sizes for
use in the interpolation computation (i.e. the computation will be identical to if the computed
`output_size` were passed-in explicitly). Note that when `scale_factor` is floating-point,
the recomputed scale_factor may differ from the one passed in due to rounding and precision
issues.
.. note::
With ``mode='bicubic'``, it's possible to cause overshoot, in other words it can produce
negative values or values greater than 255 for images.
Explicitly call ``result.clamp(min=0, max=255)`` if you want to reduce the overshoot
when displaying the image.
.. warning::
With ``align_corners = True``, the linearly interpolating modes
(`linear`, `bilinear`, and `trilinear`) don't proportionally align the
output and input pixels, and thus the output values can depend on the
input size. This was the default behavior for these modes up to version
0.3.1. Since then, the default behavior is ``align_corners = False``.
        See :class:`~oneflow.experimental.nn.Upsample` for concrete examples on how this
affects the outputs.
.. warning::
When scale_factor is specified, if recompute_scale_factor=True,
scale_factor is used to compute the output_size which will then
be used to infer new scales for the interpolation.
The default behavior for recompute_scale_factor changed to False
in 1.6.0, and scale_factor is used in the interpolation
calculation.
For example:
.. code-block:: python
>>> import oneflow.experimental as flow
>>> import numpy as np
>>> flow.enable_eager_execution()
>>> input = flow.Tensor(np.arange(1, 5).reshape((1, 1, 4)), dtype=flow.float32)
>>> output = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="linear")
>>> output
tensor([[[1. , 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4. ]]],
dtype=oneflow.float32)
"""
return Interpolate(
size=size,
scale_factor=scale_factor,
mode=mode,
align_corners=align_corners,
recompute_scale_factor=recompute_scale_factor,
)(input)
if __name__ == "__main__":
import doctest
doctest.testmod(raise_on_error=True)
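To illustrate the align_corners flag documented above, here is a short sketch comparing linear 1D interpolation with and without corner alignment; the expected values are taken from the tests added in this commit (eager mode assumed):

import numpy as np
import oneflow.experimental as flow

flow.enable_eager_execution()

x = flow.Tensor(np.arange(1, 5).reshape((1, 1, 4)), dtype=flow.float32)
print(flow.nn.functional.interpolate(x, scale_factor=2.0, mode="linear").numpy())
# [1.0, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.0]  (align_corners=False, the default)
print(flow.nn.functional.interpolate(x, scale_factor=2.0, mode="linear", align_corners=True).numpy())
# [1.0, 1.4286, 1.8571, 2.2857, 2.7143, 3.1429, 3.5714, 4.0]  (uniform step (in-1)/(out-1) = 3/7)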
"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import unittest
from collections import OrderedDict
import numpy as np
import oneflow.experimental as flow
from test_util import GenArgList
def _test_interpolate_linear_1d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 4)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="linear")
np_out = [[[1.0, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75, 4.0]]]
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[2.0, 2.0, 2.0, 2.0]]]
test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 1e-4, 1e-4))
input.grad = None
of_out = flow.nn.functional.interpolate(
input, scale_factor=2.0, mode="linear", align_corners=True
)
np_out = [
[
[
1.0,
1.4285714626312256,
1.8571429252624512,
2.2857141494750977,
2.7142856121063232,
3.142857074737549,
3.5714285373687744,
4.0,
]
]
]
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
of_out = of_out.sum()
of_out.backward()
np_grad = [
[
[
1.7142856121063232,
2.2857141494750977,
2.2857143878936768,
1.7142856121063232,
]
]
]
test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 1e-4, 1e-4))
def _test_interpolate_nearest_1d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 4)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
np_out = [[[1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0]]]
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-5, 1e-5))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[2.0, 2.0, 2.0, 2.0]]]
test_case.assertTrue(np.allclose(np_grad, input.grad.numpy(), 1e-4, 1e-4))
def _test_interpolate_nearest_2d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 2, 2)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
np_out = np.array(
[
[
[
[1.0, 1.0, 2.0, 2.0],
[1.0, 1.0, 2.0, 2.0],
[3.0, 3.0, 4.0, 4.0],
[3.0, 3.0, 4.0, 4.0],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_nearest_3d(test_case, device):
input = flow.Tensor(
np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="nearest")
np_out = np.array(
[
[
[
[
[1.0, 1.0, 2.0, 2.0],
[1.0, 1.0, 2.0, 2.0],
[3.0, 3.0, 4.0, 4.0],
[3.0, 3.0, 4.0, 4.0],
],
[
[1.0, 1.0, 2.0, 2.0],
[1.0, 1.0, 2.0, 2.0],
[3.0, 3.0, 4.0, 4.0],
[3.0, 3.0, 4.0, 4.0],
],
[
[5.0, 5.0, 6.0, 6.0],
[5.0, 5.0, 6.0, 6.0],
[7.0, 7.0, 8.0, 8.0],
[7.0, 7.0, 8.0, 8.0],
],
[
[5.0, 5.0, 6.0, 6.0],
[5.0, 5.0, 6.0, 6.0],
[7.0, 7.0, 8.0, 8.0],
[7.0, 7.0, 8.0, 8.0],
],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[[8.0, 8.0], [8.0, 8.0]], [[8.0, 8.0], [8.0, 8.0]]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_bilinear_2d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 2, 2)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="bilinear")
np_out = np.array(
[
[
[
[1.0, 1.25, 1.75, 2.0],
[1.5, 1.75, 2.25, 2.5],
[2.5, 2.75, 3.25, 3.5],
[3.0, 3.25, 3.75, 4.0],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_bicubic_2d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 2, 2)).astype(np.float32),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="bicubic")
np_out = np.array(
[
[
[
[0.68359375, 1.015625, 1.5625, 1.89453125],
[1.34765625, 1.6796875, 2.2265625, 2.55859375],
[2.44140625, 2.7734375, 3.3203125, 3.65234375],
[3.10546875, 3.4375, 3.984375, 4.31640625],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[4.0, 4.0], [4.0, 4.0]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_bicubic_same_dim_2d(test_case, device):
input = flow.Tensor(
np.arange(1, 5).reshape((1, 1, 2, 2)).astype(np.float32),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=1.0, mode="bicubic")
np_out = [[[[1.0, 2.0], [3.0, 4.0]]]]
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[1.0, 1.0], [1.0, 1.0]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_trilinear_3d(test_case, device):
input = flow.Tensor(
np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(input, scale_factor=2.0, mode="trilinear")
np_out = np.array(
[
[
[
[
[1.0, 1.25, 1.75, 2.0],
[1.5, 1.75, 2.25, 2.5],
[2.5, 2.75, 3.25, 3.5],
[3.0, 3.25, 3.75, 4.0],
],
[
[2.0, 2.25, 2.75, 3.0],
[2.5, 2.75, 3.25, 3.5],
[3.5, 3.75, 4.25, 4.5],
[4.0, 4.25, 4.75, 5.0],
],
[
[4.0, 4.25, 4.75, 5.0],
[4.5, 4.75, 5.25, 5.5],
[5.5, 5.75, 6.25, 6.5],
[6.0, 6.25, 6.75, 7.0],
],
[
[5.0, 5.25, 5.75, 6.0],
[5.5, 5.75, 6.25, 6.5],
[6.5, 6.75, 7.25, 7.5],
[7.0, 7.25, 7.75, 8.0],
],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [[[[[8.0, 8.0], [8.0, 8.0]], [[8.0, 8.0], [8.0, 8.0]]]]]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
def _test_interpolate_trilinear_3d_align_corners(test_case, device):
input = flow.Tensor(
np.arange(1, 9).reshape((1, 1, 2, 2, 2)),
device=flow.device(device),
dtype=flow.float32,
requires_grad=True,
)
of_out = flow.nn.functional.interpolate(
input, scale_factor=2.0, mode="trilinear", align_corners=True
)
np_out = np.array(
[
[
[
[
[1.0, 1.3333332538604736, 1.6666667461395264, 2.0],
[
1.6666666269302368,
2.0,
2.3333334922790527,
2.6666665077209473,
],
[
2.3333332538604736,
2.6666665077209473,
3.0,
3.3333334922790527,
],
[3.0, 3.3333332538604736, 3.6666667461395264, 4.0],
],
[
[
2.3333334922790527,
2.6666665077209473,
3.0,
3.3333332538604736,
],
[3.0, 3.3333330154418945, 3.6666665077209473, 4.0],
[
3.6666665077209473,
4.0,
4.333333492279053,
4.6666669845581055,
],
[4.333333492279053, 4.666666030883789, 5.0, 5.3333330154418945],
],
[
[3.6666667461395264, 4.0, 4.333333492279053, 4.666666507720947],
[4.333333492279053, 4.666666507720947, 5.0, 5.3333330154418945],
[5.0, 5.333333492279053, 5.6666669845581055, 6.0],
[
5.6666669845581055,
6.0,
6.333333492279053,
6.6666669845581055,
],
],
[
[5.0, 5.3333330154418945, 5.666666507720947, 6.0],
[
5.666666507720947,
5.999999523162842,
6.3333330154418945,
6.666666507720947,
],
[6.333333492279053, 6.666666030883789, 7.0, 7.333333492279053],
[7.0, 7.3333330154418945, 7.6666669845581055, 8.0],
],
]
]
]
)
test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-4, 1e-4))
of_out = of_out.sum()
of_out.backward()
np_grad = [
[
[
[[7.999999523162842, 8.0], [7.999999523162842, 8.0]],
[[8.0, 8.0], [8.0, 8.0]],
]
]
]
test_case.assertTrue(np.allclose(input.grad.numpy(), np_grad, 1e-5, 1e-5))
@unittest.skipIf(
not flow.unittest.env.eager_execution_enabled(),
".numpy() doesn't work in lazy mode",
)
class TestUpsample2d(flow.unittest.TestCase):
def test_upsample2d(test_case):
arg_dict = OrderedDict()
arg_dict["test_fun"] = [
_test_interpolate_linear_1d,
_test_interpolate_nearest_1d,
_test_interpolate_nearest_2d,
_test_interpolate_nearest_3d,
_test_interpolate_bilinear_2d,
_test_interpolate_bicubic_2d,
_test_interpolate_bicubic_same_dim_2d,
_test_interpolate_trilinear_3d,
_test_interpolate_trilinear_3d_align_corners,
]
arg_dict["device"] = [
"cpu",
"cuda",
]
for arg in GenArgList(arg_dict):
for i in range(100):
arg[0](test_case, *arg[1:])
if __name__ == "__main__":
unittest.main()
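The expected nearest-neighbor outputs above are consistent with mapping every output index back to floor(dst / scale_factor), clamped to the input range. A minimal NumPy sketch of that mapping, written here for illustration only (it is an assumption about the kernel's index rule, not the kernel code itself):

import numpy as np

def nearest_upsample_1d(x, scale):
    # gather each output position from its nearest source index
    out_len = int(len(x) * scale)
    src = np.minimum((np.arange(out_len) / scale).astype(np.int64), len(x) - 1)
    return x[src]

print(nearest_upsample_1d(np.array([1.0, 2.0, 3.0, 4.0]), 2.0))
# [1. 1. 2. 2. 3. 3. 4. 4.]  -- matches _test_interpolate_nearest_1d above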
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
template<typename T>
class UpsampleBicubic2dCPUKernel final : public user_op::OpKernel {
public:
UpsampleBicubic2dCPUKernel() = default;
~UpsampleBicubic2dCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const T* in_ptr = x_tensor->dptr<T>();
T* out_ptr = y_tensor->mut_dptr<T>();
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int nbatch = x_tensor->shape().At(0);
const int channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
memcpy(out_ptr, in_ptr, sizeof(T) * nbatch * channels * in_height * in_width);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
for (int64_t output_y = 0; output_y < out_height; output_y++) {
for (int64_t output_x = 0; output_x < out_width; output_x++) {
const T* in = in_ptr;
T* out = out_ptr;
const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true);
int64_t input_x = std::floor(real_x);
const T t_x = real_x - input_x;
const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true);
int64_t input_y = std::floor(real_y);
const T t_y = real_y - input_y;
for (int64_t c = 0; c < channels * nbatch; c++) {
T coefficients[4];
// Interpolate 4 times in the x direction
for (int64_t i = 0; i < 4; i++) {
coefficients[i] =
cubic_interp1d<T>(upsample_get_value_bounded<T>(in, in_width, in_height,
input_x - 1, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height,
input_x + 0, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height,
input_x + 1, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height,
input_x + 2, input_y - 1 + i),
t_x);
}
// Interpolate in the y direction using x interpolations
out[output_y * out_width + output_x] = cubic_interp1d<T>(
coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y);
// Move to next channel
in += in_width * in_height;
out += out_width * out_height;
}
}
}
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleBicubic2dGradCPUKernel final : public user_op::OpKernel {
public:
UpsampleBicubic2dGradCPUKernel() = default;
~UpsampleBicubic2dGradCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
T* in_ptr = dx_tensor->mut_dptr<T>();
const T* out_ptr = dy_tensor->dptr<T>();
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int nbatch = dx_tensor->shape().At(0);
int channels = dx_tensor->shape().At(1);
channels = channels * nbatch;
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
      memcpy(in_ptr, out_ptr, sizeof(T) * channels * in_height * in_width);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
for (int64_t output_y = 0; output_y < out_height; output_y++) {
for (int64_t output_x = 0; output_x < out_width; output_x++) {
T* in = in_ptr;
const T* out = out_ptr;
T real_x = GetAreaPixel(scale_width, output_x, align_corners, true);
int64_t input_x = std::floor(real_x);
T t_x = real_x - input_x;
T real_y = GetAreaPixel(scale_height, output_y, align_corners, true);
int64_t input_y = std::floor(real_y);
T t_y = real_y - input_y;
T x_coeffs[4];
T y_coeffs[4];
get_cubic_upsample_coefficients<T>(x_coeffs, t_x);
get_cubic_upsample_coefficients<T>(y_coeffs, t_y);
for (int64_t c = 0; c < channels; c++) {
T out_value = out[output_y * out_width + output_x];
for (int64_t i = 0; i < 4; i++) {
for (int64_t j = 0; j < 4; j++) {
upsample_increment_value_bounded<T>(in, in_width, in_height, input_x - 1 + i,
input_y - 1 + j,
out_value * y_coeffs[j] * x_coeffs[i]);
}
}
in += in_width * in_height;
out += out_width * out_height;
}
}
}
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLE_BICUBIC_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_bicubic_2d") \
.SetCreateFn<UpsampleBicubic2dCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_bicubic_2d_grad") \
.SetCreateFn<UpsampleBicubic2dGradCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_BICUBIC_CPU_KERNEL(float)
REGISTER_UPSAMPLE_BICUBIC_CPU_KERNEL(double)
REGISTER_UPSAMPLE_BICUBIC_CPU_KERNEL(int)
} // namespace oneflow
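cubic_interp1d and get_cubic_upsample_coefficients come from upsample_kernel.h, which is not part of this excerpt. A minimal NumPy sketch of the cubic convolution they implement, assuming the same Keys coefficient a = -0.75 that PyTorch uses (the commit history notes the cubic interpolation was aligned with PyTorch):

import numpy as np

A = -0.75  # assumed Keys cubic coefficient (PyTorch's choice); not read from this diff

def cubic_weights(t):
    # weights for the four taps at offsets -1, 0, +1, +2 around the sampling position
    def near(x):  # |x| <= 1
        return ((A + 2) * x - (A + 3)) * x * x + 1
    def far(x):  # 1 < |x| < 2
        return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A
    return np.array([far(t + 1), near(t), near(1 - t), far(2 - t)])

def cubic_interp1d(p, t):
    # p: the four neighboring samples, t: fractional offset in [0, 1)
    return float(np.dot(cubic_weights(t), p))

# The four weights sum to 1, so interpolating between equal neighbors is exact:
print(cubic_interp1d(np.array([2.0, 2.0, 2.0, 2.0]), 0.5))  # 2.0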
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/cuda/atomic.cuh"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
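// Clamps (x, y) to the image border and accumulates `value` with an atomic add,
// since several output pixels may scatter gradient into the same input element.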
template<typename T>
__device__ void upsample_increment_value_bounded_cuda(T* data, int64_t width, int64_t height,
int64_t x, int64_t y, T value) {
int64_t access_x = max(min(x, width - 1), static_cast<int64_t>(0));
int64_t access_y = max(min(y, height - 1), static_cast<int64_t>(0));
cuda::atomic::Add(data + access_y * width + access_x, value);
}
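// Forward bicubic kernel: each thread owns one output (y, x) position and walks all
// nbatch * channels planes, interpolating four rows along x with cubic weights and
// then blending the row results along y.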
template<typename T>
__global__ void UpsampleBicubic2dForward(const int64_t elem_cnt, const T* in_dptr,
const int64_t nbatch, const int64_t channels,
const int64_t in_height, const int64_t in_width,
const int64_t out_height, const int64_t out_width,
const float scale_height, const float scale_width,
bool align_corners, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
const int output_x = idx % out_width;
const int output_y = idx / out_width;
const T* in = in_dptr;
T* out = out_dptr;
const T real_x = GetAreaPixel(scale_width, output_x, align_corners, /*cubic=*/true);
int64_t input_x = std::floor(1.0 * real_x);
const T t_x = real_x - input_x;
const T real_y = GetAreaPixel(scale_height, output_y, align_corners, /*cubic=*/true);
int64_t input_y = std::floor(1.0 * real_y);
const T t_y = real_y - input_y;
for (int64_t c = 0; c < channels * nbatch; c++) {
T coefficients[4];
// Interpolate 4 times in the x direction
for (int64_t i = 0; i < 4; i++) {
coefficients[i] = cubic_interp1d<T>(
upsample_get_value_bounded<T>(in, in_width, in_height, input_x - 1, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height, input_x + 0, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height, input_x + 1, input_y - 1 + i),
upsample_get_value_bounded<T>(in, in_width, in_height, input_x + 2, input_y - 1 + i),
t_x);
}
// Interpolate in the y direction using x interpolations
out[output_y * out_width + output_x] = cubic_interp1d<T>(
coefficients[0], coefficients[1], coefficients[2], coefficients[3], t_y);
// Move to next channel
in += in_width * in_height;
out += out_width * out_height;
}
}
}
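// Backward bicubic kernel: each thread takes one dy position per plane and scatters it
// into the surrounding 4x4 input window, weighted by the separable cubic coefficients.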
template<typename T>
__global__ void UpsampleBicubic2dBackward(const int64_t elem_cnt, const T* dy_dptr,
const int64_t nbatch, const int64_t channels,
const int64_t in_height, const int64_t in_width,
const int64_t out_height, const int64_t out_width,
const float scale_height, const float scale_width,
bool align_corners, T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(idx, elem_cnt) {
const int output_x = idx % out_width;
const int output_y = idx / out_width;
T* in = dx_dptr;
const T* out = dy_dptr;
T real_x = GetAreaPixel(scale_width, output_x, align_corners, true);
int64_t input_x = std::floor(1.0 * real_x);
T t_x = real_x - input_x;
T real_y = GetAreaPixel(scale_height, output_y, align_corners, true);
int64_t input_y = std::floor(1.0 * real_y);
T t_y = real_y - input_y;
T x_coeffs[4];
T y_coeffs[4];
get_cubic_upsample_coefficients<T>(x_coeffs, t_x);
get_cubic_upsample_coefficients<T>(y_coeffs, t_y);
    for (int64_t c = 0; c < channels * nbatch; c++) {  // cover every (n, c) plane, matching the forward kernel
T out_value = out[output_y * out_width + output_x];
for (int64_t i = 0; i < 4; i++) {
for (int64_t j = 0; j < 4; j++) {
upsample_increment_value_bounded_cuda<T>(in, in_width, in_height, input_x - 1 + i,
input_y - 1 + j,
out_value * y_coeffs[j] * x_coeffs[i]);
}
}
in += in_width * in_height;
out += out_width * out_height;
}
}
}
} // namespace
template<typename T>
class UpsampleBicubic2dGPUKernel final : public user_op::OpKernel {
public:
UpsampleBicubic2dGPUKernel() = default;
~UpsampleBicubic2dGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const T* in_ptr = x_tensor->dptr<T>();
T* out_ptr = y_tensor->mut_dptr<T>();
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int nbatch = x_tensor->shape().At(0);
const int channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
const int64_t elem_cnt = out_height * out_width;
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleBicubic2dForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
out_width, scale_height, scale_width, align_corners, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleBicubic2dGradGPUKernel final : public user_op::OpKernel {
public:
UpsampleBicubic2dGradGPUKernel() = default;
~UpsampleBicubic2dGradGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int nbatch = dx_tensor->shape().At(0);
const int channels = dx_tensor->shape().At(1);
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
const int64_t elem_cnt = out_height * out_width;
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleBicubic2dBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), nbatch, channels, in_height, in_width, out_height,
out_width, scale_height, scale_width, align_corners,
dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLE_BICUBIC_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_bicubic_2d") \
.SetCreateFn<UpsampleBicubic2dGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_bicubic_2d_grad") \
.SetCreateFn<UpsampleBicubic2dGradGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_BICUBIC_GPU_KERNEL(float)
REGISTER_UPSAMPLE_BICUBIC_GPU_KERNEL(double)
REGISTER_UPSAMPLE_BICUBIC_GPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
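// Forward bilinear pass: for every output element, GetBilinearParam resolves the two
// neighbouring source rows/columns and their lerp weights, and the result is a
// bilinear blend of the four corner values.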
template<typename T>
static void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 4> in_helper,
NdIndexOffsetHelper<int64_t, 4> out_helper,
const int64_t in_height, const int64_t in_width,
const T scale_h, const T scale_w, const bool align_corners,
T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h, w;
out_helper.OffsetToNdIndex(index, n, c, h, w);
BilinearParam<T> params;
GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, &params);
const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0);
const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0);
const T top_left = in_dptr[top_offset + params.left_w_index];
const T top_right = in_dptr[top_offset + params.right_w_index];
const T bottom_left = in_dptr[bottom_offset + params.left_w_index];
const T bottom_right = in_dptr[bottom_offset + params.right_w_index];
const T top = top_left + (top_right - top_left) * params.w_lerp;
const T bottom = bottom_left + (bottom_right - bottom_left) * params.w_lerp;
out_dptr[index] = top + (bottom - top) * params.h_lerp;
}
}
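// Backward bilinear pass: each dy element is split across the same four source corners
// using the bilinear weights (plain += is safe here because the loop is sequential).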
template<typename T>
static void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 4> dy_helper,
NdIndexOffsetHelper<int64_t, 4> dx_helper,
const int64_t dx_height, const int64_t dx_width,
const T scale_h, const T scale_w, const bool align_corners,
T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h, w;
dy_helper.OffsetToNdIndex(index, n, c, h, w);
BilinearParam<T> params;
GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, &params);
const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0);
const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0);
const T dy = dy_dptr[index];
const T dbottom = params.h_lerp * dy;
T* dx_dptr_bottom_offset = dx_dptr + bottom_offset;
*(dx_dptr_bottom_offset + params.left_w_index) += static_cast<T>((1 - params.w_lerp) * dbottom);
*(dx_dptr_bottom_offset + params.right_w_index) += static_cast<T>(params.w_lerp * dbottom);
const T dtop = dy - dbottom;
T* dx_dptr_top_offset = dx_dptr + top_offset;
*(dx_dptr_top_offset + params.left_w_index) += static_cast<T>((1 - params.w_lerp) * dtop);
*(dx_dptr_top_offset + params.right_w_index) += static_cast<T>(params.w_lerp * dtop);
}
}
} // namespace
template<typename T>
class UpsampleBilinear2DCPUKernel final : public user_op::OpKernel {
public:
UpsampleBilinear2DCPUKernel() = default;
~UpsampleBilinear2DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 4> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3));
const int64_t nbatch = x_tensor->shape().At(0);
const int64_t channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
memcpy(y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height * in_width);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
UpsampleBilinear2DForward<T>(elem_cnt, x_tensor->dptr<T>(), in_helper, out_helper, in_height,
in_width, scale_height, scale_width, align_corners,
y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleBilinear2DGradCPUKernel final : public user_op::OpKernel {
public:
UpsampleBilinear2DGradCPUKernel() = default;
~UpsampleBilinear2DGradCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 4> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3));
const int64_t nbatch = dx_tensor->shape().At(0);
const int64_t channels = dx_tensor->shape().At(1);
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
memcpy(dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height * in_width);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
UpsampleBilinearBackward<T>(elem_cnt, dy_tensor->dptr<T>(), dy_helper, dx_helper, in_height,
in_width, scale_height, scale_width, align_corners,
dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLE_BILINEAR_2D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_bilinear_2d") \
.SetCreateFn<UpsampleBilinear2DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \
.SetCreateFn<UpsampleBilinear2DGradCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_BILINEAR_2D_CPU_KERNEL(float)
REGISTER_UPSAMPLE_BILINEAR_2D_CPU_KERNEL(double)
REGISTER_UPSAMPLE_BILINEAR_2D_CPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/cuda/atomic.cuh"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
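// GPU bilinear kernels mirror the CPU versions; the backward kernel uses
// cuda::atomic::Add because multiple output pixels can map to the same input corner.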
template<typename T>
__global__ void UpsampleBilinear2DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 4> in_helper,
NdIndexOffsetHelper<int64_t, 4> out_helper,
const int64_t in_height, const int64_t in_width,
const T scale_h, const T scale_w,
const bool align_corners, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h, w;
out_helper.OffsetToNdIndex(index, n, c, h, w);
BilinearParam<T> params;
GetBilinearParam(align_corners, h, w, in_height, in_width, scale_h, scale_w, &params);
const int64_t top_offset = in_helper.NdIndexToOffset(n, c, params.top_h_index, 0);
const int64_t bottom_offset = in_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0);
const T top_left = in_dptr[top_offset + params.left_w_index];
const T top_right = in_dptr[top_offset + params.right_w_index];
const T bottom_left = in_dptr[bottom_offset + params.left_w_index];
const T bottom_right = in_dptr[bottom_offset + params.right_w_index];
const T top = top_left + (top_right - top_left) * params.w_lerp;
const T bottom = bottom_left + (bottom_right - bottom_left) * params.w_lerp;
out_dptr[index] = top + (bottom - top) * params.h_lerp;
}
}
template<typename T>
__global__ void UpsampleBilinearBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 4> dy_helper,
NdIndexOffsetHelper<int64_t, 4> dx_helper,
const int64_t dx_height, const int64_t dx_width,
const T scale_h, const T scale_w, const bool align_corners,
T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h, w;
dy_helper.OffsetToNdIndex(index, n, c, h, w);
BilinearParam<T> params;
GetBilinearParam(align_corners, h, w, dx_height, dx_width, scale_h, scale_w, &params);
const int64_t top_offset = dx_helper.NdIndexToOffset(n, c, params.top_h_index, 0);
const int64_t bottom_offset = dx_helper.NdIndexToOffset(n, c, params.bottom_h_index, 0);
const T dy = dy_dptr[index];
const T dbottom = params.h_lerp * dy;
T* dx_dptr_bottom_offset = dx_dptr + bottom_offset;
cuda::atomic::Add(dx_dptr_bottom_offset + params.left_w_index,
static_cast<T>((1 - params.w_lerp) * dbottom));
cuda::atomic::Add(dx_dptr_bottom_offset + params.right_w_index,
static_cast<T>(params.w_lerp * dbottom));
const T dtop = dy - dbottom;
T* dx_dptr_top_offset = dx_dptr + top_offset;
cuda::atomic::Add(dx_dptr_top_offset + params.left_w_index,
static_cast<T>((1 - params.w_lerp) * dtop));
cuda::atomic::Add(dx_dptr_top_offset + params.right_w_index,
static_cast<T>(params.w_lerp * dtop));
}
}
} // namespace
template<typename T>
class UpsampleBilinear2DGPUKernel final : public user_op::OpKernel {
public:
UpsampleBilinear2DGPUKernel() = default;
~UpsampleBilinear2DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 4> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3));
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleBilinear2DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, in_height, in_width, scale_height,
scale_width, align_corners, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleBilinear2DGradGPUKernel final : public user_op::OpKernel {
public:
UpsampleBilinear2DGradGPUKernel() = default;
~UpsampleBilinear2DGradGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 4> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3));
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleBilinearBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, in_height, in_width, scale_height,
scale_width, align_corners, dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLE_BILINEAR_2D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_bilinear_2d") \
.SetCreateFn<UpsampleBilinear2DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_bilinear_2d_grad") \
.SetCreateFn<UpsampleBilinear2DGradGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_BILINEAR_2D_GPU_KERNEL(float)
REGISTER_UPSAMPLE_BILINEAR_2D_GPU_KERNEL(double)
REGISTER_UPSAMPLE_BILINEAR_2D_GPU_KERNEL(int)
} // namespace oneflow
@@ -15,6 +15,16 @@ limitations under the License.
*/
#include "oneflow/core/common/nd_index_offset_helper.h"
template<typename T>
OF_DEVICE_FUNC T GetLinearInputIndex(const int64_t out_dim_idx, const T scale, bool align_corners) {
if (align_corners) {
return static_cast<T>(scale * out_dim_idx);
} else {
T src_idx = scale * (out_dim_idx + 0.5) - 0.5;
return static_cast<T>(src_idx < 0 ? 0 : src_idx);
}
}
OF_DEVICE_FUNC static int64_t GetNearestInputIndex(const int64_t out_dim_idx, const float scale,
const int64_t in_dim_size) {
int64_t index = static_cast<int64_t>(std::floor((static_cast<float>(out_dim_idx) * scale)));
@@ -37,6 +47,17 @@ OF_DEVICE_FUNC T GetAreaPixelScale(const int64_t input_size, const int64_t outpu
}
}
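// Same coordinate mapping as GetLinearInputIndex, but when `cubic` is true a negative
// source index is returned unclamped so the cubic weights see the true fractional offset.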
template<typename T>
OF_DEVICE_FUNC T GetAreaPixel(const T scale, const int64_t dst_index, bool align_corners,
bool cubic = false) {
if (align_corners) {
return scale * dst_index;
} else {
T src_idx = scale * (dst_index + 0.5) - 0.5;
return (!cubic && src_idx < 0) ? static_cast<T>(0) : src_idx;
}
}
template<typename T>
struct BilinearParam {
int64_t top_h_index;
@@ -78,3 +99,60 @@ OF_DEVICE_FUNC void GetBilinearParam(const bool align_corners, const int64_t h,
params->right_w_index = w1 + w1p;
params->w_lerp = w1r - w1;
}
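// Border-clamped accessors used by the bicubic kernels: reads and gradient accumulation
// that fall outside the image are redirected to the nearest edge element.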
template<typename T>
OF_DEVICE_FUNC void upsample_increment_value_bounded(T* data, int64_t width, int64_t height,
int64_t x, int64_t y, T value) {
int64_t access_x = std::max(std::min(x, width - 1), static_cast<int64_t>(0));
int64_t access_y = std::max(std::min(y, height - 1), static_cast<int64_t>(0));
data[access_y * width + access_x] += value;
}
template<typename T>
OF_DEVICE_FUNC T upsample_get_value_bounded(const T* data, const int64_t width,
const int64_t height, const int64_t x,
const int64_t y) {
int64_t access_x = x;
access_x = access_x > width - 1 ? width - 1 : access_x;
access_x = access_x < 0 ? 0 : access_x;
int64_t access_y = y;
access_y = access_y > height - 1 ? height - 1 : access_y;
access_y = access_y < 0 ? 0 : access_y;
return data[access_y * width + access_x];
}
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
template<typename T>
OF_DEVICE_FUNC T cubic_convolution1(const T x, const T A) {
return ((A + 2.0) * x - (A + 3.0)) * x * x + 1.0;
}
template<typename T>
OF_DEVICE_FUNC T cubic_convolution2(const T x, const T A) {
return ((A * x - 5.0 * A) * x + 8.0 * A) * x - 4.0 * A;
}
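// Computes the four bicubic weights for a fractional offset t in [0, 1) using Keys'
// convolution kernel with A = -0.75, the same value PyTorch's bicubic upsampling uses;
// coeffs[1]/coeffs[2] weight the two nearest samples, coeffs[0]/coeffs[3] the outer ones.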
template<typename T>
OF_DEVICE_FUNC void get_cubic_upsample_coefficients(T coeffs[4], const T t) {
T A = -0.75;
T x1 = t;
coeffs[0] = cubic_convolution2<T>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<T>(x1, A);
// opposite coefficients
T x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<T>(x2, A);
coeffs[3] = cubic_convolution2<T>(x2 + 1.0, A);
}
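// One-dimensional cubic interpolation of four consecutive samples x0..x3 at offset t.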
template<typename T>
OF_DEVICE_FUNC T cubic_interp1d(const T x0, const T x1, const T x2, const T x3, const T t) {
T coeffs[4];
get_cubic_upsample_coefficients<T>(coeffs, t);
return x0 * coeffs[0] * 1.0 + x1 * coeffs[1] * 1.0 + x2 * coeffs[2] * 1.0 + x3 * coeffs[3] * 1.0;
}
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
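// 1D linear interpolation: h1r is the fractional source coordinate, h1 and h1 + h1p are
// the two neighbouring samples (h1p is 0 at the right edge), and h0lambda / h1lambda are
// their blend weights. The backward pass distributes dy with the same two weights.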
template<typename T>
static void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 3> in_helper,
NdIndexOffsetHelper<int64_t, 3> out_helper, const int in_height,
const float scale_factor, bool align_corners, T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h;
out_helper.OffsetToNdIndex(index, n, c, h);
const T h1r = GetLinearInputIndex(h, scale_factor, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)]
+ h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)];
}
}
template<typename T>
static void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 3> dy_helper,
NdIndexOffsetHelper<int64_t, 3> dx_helper, const int in_height,
const float scale_factor, bool align_corners, T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h;
dy_helper.OffsetToNdIndex(index, n, c, h);
const T h1r = GetLinearInputIndex(h, scale_factor, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
*(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1)) += h0lambda * dy_dptr[index];
*(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p)) += h1lambda * dy_dptr[index];
}
}
} // namespace
template<typename T>
class UpsampleLinear1DCPUKernel final : public user_op::OpKernel {
public:
UpsampleLinear1DCPUKernel() = default;
~UpsampleLinear1DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 3> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2));
const int64_t nbatch = x_tensor->shape().At(0);
const int64_t channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(2);
if (in_height == out_height) {
memcpy(y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
UpsampleLinear1DForward<T>(elem_cnt, x_tensor->dptr<T>(), in_helper, out_helper, in_height,
scale_height, align_corners, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleLinearGrad1DCPUKernel final : public user_op::OpKernel {
public:
UpsampleLinearGrad1DCPUKernel() = default;
~UpsampleLinearGrad1DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const bool align_corners = ctx->Attr<bool>("align_corners");
NdIndexOffsetHelper<int64_t, 3> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2));
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
const int64_t nbatch = dx_tensor->shape().At(0);
const int64_t channels = dx_tensor->shape().At(1);
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(2);
if (in_height == out_height) {
memcpy(dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height);
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
UpsampleLinear1DBackward<T>(elem_cnt, dy_tensor->dptr<T>(), dy_helper, dx_helper, in_height,
scale_height, align_corners, dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLELINEAR1D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_linear_1d") \
.SetCreateFn<UpsampleLinear1DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_linear_1d_grad") \
.SetCreateFn<UpsampleLinearGrad1DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLELINEAR1D_CPU_KERNEL(float)
REGISTER_UPSAMPLELINEAR1D_CPU_KERNEL(double)
REGISTER_UPSAMPLELINEAR1D_CPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/cuda/atomic.cuh"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
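// GPU counterparts of the 1D linear kernels; the backward kernel accumulates with
// cuda::atomic::Add because several output elements can map to the same input element.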
template<typename T>
__global__ void UpsampleLinear1DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 3> in_helper,
NdIndexOffsetHelper<int64_t, 3> out_helper,
const int in_height, const float scale_factor,
bool align_corners, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h;
out_helper.OffsetToNdIndex(index, n, c, h);
const T h1r = GetLinearInputIndex(h, scale_factor, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
out_dptr[index] = h0lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1)]
+ h1lambda * in_dptr[in_helper.NdIndexToOffset(n, c, h1 + h1p)];
}
}
template<typename T>
__global__ void UpsampleLinear1DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 3> dy_helper,
NdIndexOffsetHelper<int64_t, 3> dx_helper,
const int in_height, const float scale_factor,
bool align_corners, T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h;
dy_helper.OffsetToNdIndex(index, n, c, h);
const T h1r = GetLinearInputIndex(h, scale_factor, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1), h0lambda * dy_dptr[index]);
cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, h1 + h1p),
h1lambda * dy_dptr[index]);
}
}
} // namespace
template<typename T>
class UpsampleLinear1DGPUKernel final : public user_op::OpKernel {
public:
UpsampleLinear1DGPUKernel() = default;
~UpsampleLinear1DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 3> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2));
const int64_t in_height = x_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(2);
if (in_height == out_height) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
RUN_CUDA_KERNEL((UpsampleLinear1DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, in_height, scale_height,
align_corners, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleLinearGrad1DGPUKernel final : public user_op::OpKernel {
public:
UpsampleLinearGrad1DGPUKernel() = default;
~UpsampleLinearGrad1DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const bool align_corners = ctx->Attr<bool>("align_corners");
NdIndexOffsetHelper<int64_t, 3> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2));
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(2);
if (in_height == out_height) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
} else {
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
RUN_CUDA_KERNEL((UpsampleLinear1DBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, in_height, scale_height,
align_corners, dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLELINEAR1D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_linear_1d") \
.SetCreateFn<UpsampleLinear1DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_linear_1d_grad") \
.SetCreateFn<UpsampleLinearGrad1DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLELINEAR1D_GPU_KERNEL(float)
REGISTER_UPSAMPLELINEAR1D_GPU_KERNEL(double)
REGISTER_UPSAMPLELINEAR1D_GPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
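// Nearest-neighbour kernels: the forward pass gathers the nearest source element and the
// backward pass scatter-adds dy into it. The scale arguments are output-to-input ratios,
// so the callers below pass the reciprocal of the upsample factor (e.g. 1.f / height_scale).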
template<typename T>
static void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 3> in_helper,
NdIndexOffsetHelper<int64_t, 3> out_helper,
const int64_t in_height, const float scale_factor,
T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h;
out_helper.OffsetToNdIndex(index, n, c, h);
const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)];
}
}
template<typename T>
static void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 3> dy_helper,
NdIndexOffsetHelper<int64_t, 3> dx_helper,
const int64_t in_height, const float scale_factor,
T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h;
dy_helper.OffsetToNdIndex(index, n, c, h);
const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height);
*(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h)) += dy_dptr[index];
}
}
template<typename T>
static void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 4> in_helper,
NdIndexOffsetHelper<int64_t, 4> out_helper,
const int64_t in_height, const int64_t in_width,
const float scale_h, const float scale_w, T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h, w;
out_helper.OffsetToNdIndex(index, n, c, h, w);
const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)];
}
}
template<typename T>
static void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 4> dy_helper,
NdIndexOffsetHelper<int64_t, 4> dx_helper,
const int64_t dx_height, const int64_t dx_width,
const float scale_h, const float scale_w, T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, h, w;
dy_helper.OffsetToNdIndex(index, n, c, h, w);
const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height);
const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width);
*(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w)) += dy_dptr[index];
}
}
template<typename T>
static void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 5> in_helper,
NdIndexOffsetHelper<int64_t, 5> out_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const float scale_d,
const float scale_h, const float scale_w, T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, d, h, w;
out_helper.OffsetToNdIndex(index, n, c, d, h, w);
const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width);
const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)];
}
}
template<typename T>
static void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 5> dy_helper,
NdIndexOffsetHelper<int64_t, 5> dx_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const float scale_d,
const float scale_h, const float scale_w, T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, d, h, w;
dy_helper.OffsetToNdIndex(index, n, c, d, h, w);
const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width);
const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth);
*(dx_dptr + dx_helper.NdIndexToOffset(n, c, in_d, dx_h, dx_w)) += dy_dptr[index];
}
}
} // namespace
template<typename T>
class UpsampleNearest1DCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest1DCPUKernel() = default;
~UpsampleNearest1DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
const int64_t nbatch = x_tensor->shape().At(0);
const int64_t channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(2);
if (in_height == out_height) {
memcpy(y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height);
} else {
NdIndexOffsetHelper<int64_t, 3> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2));
UpsampleNearest1DForward<T>(elem_cnt, x_tensor->dptr<T>(), in_helper, out_helper,
x_tensor->shape().At(2), 1.f / height_scale,
y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleNearestGrad1DCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearestGrad1DCPUKernel() = default;
~UpsampleNearestGrad1DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
const int64_t nbatch = dx_tensor->shape().At(0);
const int64_t channels = dx_tensor->shape().At(1);
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(2);
if (in_height == out_height) {
memcpy(dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height);
} else {
NdIndexOffsetHelper<int64_t, 3> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2));
UpsampleNearest1DBackward<T>(elem_cnt, dy_tensor->dptr<T>(), dy_helper, dx_helper,
dx_tensor->shape().At(2), 1.f / height_scale,
dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPNEAREST1D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_1d") \
.SetCreateFn<UpsampleNearest1DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \
.SetCreateFn<UpsampleNearestGrad1DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPNEAREST1D_CPU_KERNEL(float)
REGISTER_UPSAMPNEAREST1D_CPU_KERNEL(double)
REGISTER_UPSAMPNEAREST1D_CPU_KERNEL(int)
template<typename T>
class UpsampleNearest2DCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest2DCPUKernel() = default;
~UpsampleNearest2DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const int64_t nbatch = x_tensor->shape().At(0);
const int64_t channels = x_tensor->shape().At(1);
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
if (in_height == out_height && in_width == out_width) {
memcpy(y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height * in_width);
} else {
NdIndexOffsetHelper<int64_t, 4> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3));
UpsampleNearest2DForward<T>(elem_cnt, x_tensor->dptr<T>(), in_helper, out_helper,
x_tensor->shape().At(2), x_tensor->shape().At(3),
1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleNearest2DGradCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest2DGradCPUKernel() = default;
~UpsampleNearest2DGradCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const int64_t nbatch = dx_tensor->shape().At(0);
const int64_t channels = dx_tensor->shape().At(1);
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
if (in_height == out_height && in_width == out_width) {
memcpy(dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
sizeof(T) * nbatch * channels * in_height * in_width);
} else {
NdIndexOffsetHelper<int64_t, 4> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3));
UpsampleNearest2DBackward<T>(elem_cnt, dy_tensor->dptr<T>(), dy_helper, dx_helper,
dx_tensor->shape().At(2), dx_tensor->shape().At(3),
1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPLE_NEAREST_2D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_2d") \
.SetCreateFn<UpsampleNearest2DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \
.SetCreateFn<UpsampleNearest2DGradCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_NEAREST_2D_CPU_KERNEL(float)
REGISTER_UPSAMPLE_NEAREST_2D_CPU_KERNEL(double)
REGISTER_UPSAMPLE_NEAREST_2D_CPU_KERNEL(int)
template<typename T>
class UpsampleNearest3DCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest3DCPUKernel() = default;
~UpsampleNearest3DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_blob = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_blob = ctx->Tensor4ArgNameAndIndex("y", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = y_blob->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> in_helper(x_blob->shape().At(0), x_blob->shape().At(1),
x_blob->shape().At(2), x_blob->shape().At(3),
x_blob->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> out_helper(y_blob->shape().At(0), y_blob->shape().At(1),
y_blob->shape().At(2), y_blob->shape().At(3),
y_blob->shape().At(4));
UpsampleNearest3DForward<T>(elem_cnt, x_blob->dptr<T>(), in_helper, out_helper,
x_blob->shape().At(2), x_blob->shape().At(3), x_blob->shape().At(4),
1.f / depth_scale, 1.f / height_scale, 1.f / width_scale,
y_blob->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleNearestGrad3DCPUKernel final : public user_op::OpKernel {
public:
UpsampleNearestGrad3DCPUKernel() = default;
~UpsampleNearestGrad3DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_blob = ctx->Tensor4ArgNameAndIndex("dx", 0);
if (dx_blob == nullptr) { return; }
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_blob->mut_dptr<T>(), 0,
dx_blob->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_blob = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = dy_blob->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> dy_helper(dy_blob->shape().At(0), dy_blob->shape().At(1),
dy_blob->shape().At(2), dy_blob->shape().At(3),
dy_blob->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> dx_helper(dx_blob->shape().At(0), dx_blob->shape().At(1),
dx_blob->shape().At(2), dx_blob->shape().At(3),
dx_blob->shape().At(4));
UpsampleNearest3DBackward<T>(elem_cnt, dy_blob->dptr<T>(), dy_helper, dx_helper,
dx_blob->shape().At(2), dx_blob->shape().At(3),
dx_blob->shape().At(4), 1.f / depth_scale, 1.f / height_scale,
1.f / width_scale, dx_blob->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPNEAREST3D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_3d") \
.SetCreateFn<UpsampleNearest3DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \
.SetCreateFn<UpsampleNearestGrad3DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPNEAREST3D_CPU_KERNEL(float)
REGISTER_UPSAMPNEAREST3D_CPU_KERNEL(double)
REGISTER_UPSAMPNEAREST3D_CPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/cuda/atomic.cuh"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
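// GPU nearest-neighbour kernels; the backward kernels use cuda::atomic::Add since
// multiple output elements can map to the same input element.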
template<typename T>
__global__ void UpsampleNearest1DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 3> in_helper,
NdIndexOffsetHelper<int64_t, 3> out_helper,
const int64_t in_height, const float scale_factor,
T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h;
out_helper.OffsetToNdIndex(index, n, c, h);
const int64_t in_h = GetNearestInputIndex(h, scale_factor, in_height);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h)];
}
}
template<typename T>
__global__ void UpsampleNearest1DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 3> dy_helper,
NdIndexOffsetHelper<int64_t, 3> dx_helper,
const int64_t in_height, const float scale_factor,
T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h;
dy_helper.OffsetToNdIndex(index, n, c, h);
const int64_t dx_h = GetNearestInputIndex(h, scale_factor, in_height);
cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h), dy_dptr[index]);
}
}
template<typename T>
__global__ void UpsampleNearest2DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 4> in_helper,
NdIndexOffsetHelper<int64_t, 4> out_helper,
const int64_t in_height, const int64_t in_width,
const float scale_h, const float scale_w, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h, w;
out_helper.OffsetToNdIndex(index, n, c, h, w);
const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_h, in_w)];
}
}
template<typename T>
__global__ void UpsampleNearest2DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 4> dy_helper,
NdIndexOffsetHelper<int64_t, 4> dx_helper,
const int64_t dx_height, const int64_t dx_width,
const float scale_h, const float scale_w, T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, h, w;
dy_helper.OffsetToNdIndex(index, n, c, h, w);
const int64_t dx_h = GetNearestInputIndex(h, scale_h, dx_height);
const int64_t dx_w = GetNearestInputIndex(w, scale_w, dx_width);
cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_h, dx_w), dy_dptr[index]);
}
}
template<typename T>
__global__ void UpsampleNearest3DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 5> in_helper,
NdIndexOffsetHelper<int64_t, 5> out_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const float scale_d,
const float scale_h, const float scale_w, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, d, h, w;
out_helper.OffsetToNdIndex(index, n, c, d, h, w);
const int64_t in_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t in_w = GetNearestInputIndex(w, scale_w, in_width);
const int64_t in_d = GetNearestInputIndex(d, scale_d, in_depth);
out_dptr[index] = in_dptr[in_helper.NdIndexToOffset(n, c, in_d, in_h, in_w)];
}
}
template<typename T>
__global__ void UpsampleNearest3DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 5> dy_helper,
NdIndexOffsetHelper<int64_t, 5> dx_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const float scale_d,
const float scale_h, const float scale_w, T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, d, h, w;
dy_helper.OffsetToNdIndex(index, n, c, d, h, w);
const int64_t dx_h = GetNearestInputIndex(h, scale_h, in_height);
const int64_t dx_w = GetNearestInputIndex(w, scale_w, in_width);
const int64_t dx_d = GetNearestInputIndex(d, scale_d, in_depth);
cuda::atomic::Add(dx_dptr + dx_helper.NdIndexToOffset(n, c, dx_d, dx_h, dx_w), dy_dptr[index]);
}
}
} // namespace
template<typename T>
class UpsampleNearest1DGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest1DGPUKernel() = default;
~UpsampleNearest1DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
const int64_t in_height = x_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(2);
if (in_height == out_height) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
} else {
NdIndexOffsetHelper<int64_t, 3> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2));
RUN_CUDA_KERNEL((UpsampleNearest1DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, x_tensor->shape().At(2),
1.f / height_scale, y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
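// Two details of the launch above are easy to miss: (1) when the input and output heights already
// match, the CUDA kernel is skipped and the data is forwarded with a device-to-device Memcpy;
// (2) the "scale_factor" attribute holds the user-facing out/in ratio, so 1.f / height_scale is
// passed to convert output coordinates back into input coordinates inside the kernel.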
template<typename T>
class UpsampleNearestGrad1DGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearestGrad1DGPUKernel() = default;
~UpsampleNearestGrad1DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("scale_factor");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(2);
if (in_height == out_height) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
} else {
NdIndexOffsetHelper<int64_t, 3> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2));
NdIndexOffsetHelper<int64_t, 3> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2));
RUN_CUDA_KERNEL((UpsampleNearest1DBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, dx_tensor->shape().At(2),
1.f / height_scale, dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPNEAREST1D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_1d") \
.SetCreateFn<UpsampleNearest1DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_1d_grad") \
.SetCreateFn<UpsampleNearestGrad1DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPNEAREST1D_GPU_KERNEL(float)
REGISTER_UPSAMPNEAREST1D_GPU_KERNEL(double)
REGISTER_UPSAMPNEAREST1D_GPU_KERNEL(int)
template<typename T>
class UpsampleNearest2DGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest2DGPUKernel() = default;
~UpsampleNearest2DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
const int64_t in_height = x_tensor->shape().At(2);
const int64_t in_width = x_tensor->shape().At(3);
const int64_t out_height = y_tensor->shape().At(2);
const int64_t out_width = y_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), y_tensor->mut_dptr<void>(), x_tensor->dptr<void>(),
x_tensor->shape().elem_cnt() * GetSizeOfDataType(x_tensor->data_type()));
} else {
NdIndexOffsetHelper<int64_t, 4> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3));
RUN_CUDA_KERNEL((UpsampleNearest2DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, x_tensor->shape().At(2),
x_tensor->shape().At(3), 1.f / height_scale, 1.f / width_scale,
y_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleNearest2DGradGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest2DGradGPUKernel() = default;
~UpsampleNearest2DGradGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
const int64_t in_height = dx_tensor->shape().At(2);
const int64_t in_width = dx_tensor->shape().At(3);
const int64_t out_height = dy_tensor->shape().At(2);
const int64_t out_width = dy_tensor->shape().At(3);
if (in_height == out_height && in_width == out_width) {
Memcpy<DeviceType::kGPU>(
ctx->device_ctx(), dx_tensor->mut_dptr<void>(), dy_tensor->dptr<void>(),
dy_tensor->shape().elem_cnt() * GetSizeOfDataType(dy_tensor->data_type()));
} else {
NdIndexOffsetHelper<int64_t, 4> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3));
NdIndexOffsetHelper<int64_t, 4> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3));
RUN_CUDA_KERNEL((UpsampleNearest2DBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, dx_tensor->shape().At(2),
dx_tensor->shape().At(3), 1.f / height_scale, 1.f / width_scale,
dx_tensor->mut_dptr<T>());
}
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
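// The grad kernels Memset dx to zero before launching because the backward CUDA kernel only
// accumulates into dx via cuda::atomic::Add; without the zero-fill, dx would contain stale
// memory. In the shape-preserving fast path the Memset is immediately overwritten by the Memcpy,
// which is redundant but harmless.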
#define REGISTER_UPSAMPLE_NEAREST_2D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_2d") \
.SetCreateFn<UpsampleNearest2DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_2d_grad") \
.SetCreateFn<UpsampleNearest2DGradGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPLE_NEAREST_2D_GPU_KERNEL(float)
REGISTER_UPSAMPLE_NEAREST_2D_GPU_KERNEL(double)
REGISTER_UPSAMPLE_NEAREST_2D_GPU_KERNEL(int)
template<typename T>
class UpsampleNearest3DGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearest3DGPUKernel() = default;
~UpsampleNearest3DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const float depth_scale = ctx->Attr<float>("depth_scale");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3),
x_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3),
y_tensor->shape().At(4));
RUN_CUDA_KERNEL((UpsampleNearest3DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, x_tensor->shape().At(2),
x_tensor->shape().At(3), x_tensor->shape().At(4), 1.f / depth_scale,
1.f / height_scale, 1.f / width_scale, y_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleNearestGrad3DGPUKernel final : public user_op::OpKernel {
public:
UpsampleNearestGrad3DGPUKernel() = default;
~UpsampleNearestGrad3DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const float depth_scale = ctx->Attr<float>("depth_scale");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3),
dy_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3),
dx_tensor->shape().At(4));
RUN_CUDA_KERNEL((UpsampleNearest3DBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, dx_tensor->shape().At(2),
dx_tensor->shape().At(3), dx_tensor->shape().At(4), 1.f / depth_scale,
1.f / height_scale, 1.f / width_scale, dx_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
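// Unlike the 1D and 2D variants above, the 3D nearest kernels do not special-case the
// shape-preserving configuration: the CUDA kernel is launched even when the input and output
// volumes have identical spatial sizes, in which case every output element simply copies its
// matching input element.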
#define REGISTER_UPSAMPNEAREST3D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_nearest_3d") \
.SetCreateFn<UpsampleNearest3DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_nearest_3d_grad") \
.SetCreateFn<UpsampleNearestGrad3DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPNEAREST3D_GPU_KERNEL(float)
REGISTER_UPSAMPNEAREST3D_GPU_KERNEL(double)
REGISTER_UPSAMPNEAREST3D_GPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
template<typename T>
static void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 5> in_helper,
NdIndexOffsetHelper<int64_t, 5> out_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const T rdepth, const T rheight,
const T rwidth, const bool align_corners, T* out_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, d, h, w;
out_helper.OffsetToNdIndex(index, n, c, d, h, w);
const T t1r = GetAreaPixel(rdepth, d, align_corners);
const int64_t t1 = t1r;
const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0;
const T t1lambda = t1r - t1;
const T t0lambda = static_cast<T>(1.) - t1lambda;
const T h1r = GetAreaPixel(rheight, h, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
const T w1r = GetAreaPixel(rwidth, w, align_corners);
const int64_t w1 = w1r;
const int64_t w1p = (w1 < in_width - 1) ? 1 : 0;
const T w1lambda = w1r - w1;
const T w0lambda = static_cast<T>(1.) - w1lambda;
const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)];
out_dptr[index] =
t0lambda
* (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ h1lambda
* (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p]))
+ t1lambda
* (h0lambda
* (w0lambda * pos1[t1p * in_height * in_width]
+ w1lambda * pos1[t1p * in_height * in_width + w1p])
+ h1lambda
* (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width]
+ w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p]));
}
}
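// GetAreaPixel maps an output coordinate to a fractional source coordinate and is declared in
// upsample_kernel.h. A minimal sketch of the mapping it is assumed to perform (mirroring
// PyTorch's area_pixel_compute_source_index; the header is authoritative):
//
//   T GetAreaPixel(const T scale, const int64_t dst_idx, const bool align_corners) {
//     if (align_corners) { return scale * dst_idx; }
//     const T src = scale * (dst_idx + static_cast<T>(0.5)) - static_cast<T>(0.5);
//     return src < static_cast<T>(0) ? static_cast<T>(0) : src;
//   }
//
// The loop above then splits that fractional coordinate into an integer corner (t1/h1/w1), an
// offset to the next in-bounds corner (t1p/h1p/w1p, forced to 0 at the upper edge so reads stay
// in range), and linear weights (t1lambda = fractional part, t0lambda = 1 - t1lambda). The output
// is the weighted sum of the eight surrounding input voxels, i.e. standard trilinear
// interpolation.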
template<typename T>
static void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 5> dy_helper,
NdIndexOffsetHelper<int64_t, 5> dx_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const T rdepth, const T rheight,
const T rwidth, const bool align_corners, T* dx_dptr) {
for (int64_t index = 0; index < elem_cnt; ++index) {
int64_t n, c, d, h, w;
dy_helper.OffsetToNdIndex(index, n, c, d, h, w);
const T t1r = GetAreaPixel(rdepth, d, align_corners);
const int64_t t1 = t1r;
const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0;
const T t1lambda = t1r - t1;
const T t0lambda = static_cast<T>(1.) - t1lambda;
const T h1r = GetAreaPixel(rheight, h, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
const T w1r = GetAreaPixel(rwidth, w, align_corners);
const int64_t w1 = w1r;
const int64_t w1p = (w1 < in_width - 1) ? 1 : 0;
const T w1lambda = w1r - w1;
const T w0lambda = static_cast<T>(1.) - w1lambda;
T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)];
const T* pos2 = &dy_dptr[index];
pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
pos1[h1p * in_width] += t0lambda * h1lambda * w0lambda * pos2[0];
pos1[h1p * in_width + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
pos1[t1p * in_height * in_width] += t1lambda * h0lambda * w0lambda * pos2[0];
pos1[t1p * in_height * in_width + w1p] += t1lambda * h0lambda * w1lambda * pos2[0];
pos1[t1p * in_height * in_width + h1p * in_width] += t1lambda * h1lambda * w0lambda * pos2[0];
pos1[t1p * in_height * in_width + h1p * in_width + w1p] +=
t1lambda * h1lambda * w1lambda * pos2[0];
}
}
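// The CPU backward accumulates into dx with plain "+=" because the element loop runs serially;
// the CUDA implementation of the same scatter further down in this diff must use
// cuda::atomic::Add instead, since neighbouring output elements that share source corners are
// handled by different threads.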
} // namespace
template<typename T>
class UpsampleTrilinear3DCPUKernel final : public user_op::OpKernel {
public:
UpsampleTrilinear3DCPUKernel() = default;
~UpsampleTrilinear3DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3),
x_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3),
y_tensor->shape().At(4));
const int64_t in_depth = x_tensor->shape().At(2);
const int64_t in_height = x_tensor->shape().At(3);
const int64_t in_width = x_tensor->shape().At(4);
const int64_t out_depth = y_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(3);
const int64_t out_width = y_tensor->shape().At(4);
const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale);
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
UpsampleTrilinear3DForward<T>(elem_cnt, x_tensor->dptr<T>(), in_helper, out_helper,
x_tensor->shape().At(2), x_tensor->shape().At(3),
x_tensor->shape().At(4), scale_depth, scale_height, scale_width,
align_corners, y_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
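// GetAreaPixelScale (from upsample_kernel.h) converts (input_size, output_size, align_corners,
// scale attribute) into the coordinate-space scale consumed by GetAreaPixel. A hedged sketch of
// the assumed behaviour, following PyTorch's area_pixel_compute_scale:
//
//   T GetAreaPixelScale(const int64_t in, const int64_t out, const bool align_corners, const T scale) {
//     if (align_corners) {
//       return out > 1 ? static_cast<T>(in - 1) / (out - 1) : static_cast<T>(0);
//     }
//     return static_cast<T>(in) / out;  // the explicit scale attribute may take precedence in the real header
//   }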
template<typename T>
class UpsampleTrilinearGrad3DCPUKernel final : public user_op::OpKernel {
public:
UpsampleTrilinearGrad3DCPUKernel() = default;
~UpsampleTrilinearGrad3DCPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kCPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3),
dy_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3),
dx_tensor->shape().At(4));
const int64_t in_depth = dx_tensor->shape().At(2);
const int64_t in_height = dx_tensor->shape().At(3);
const int64_t in_width = dx_tensor->shape().At(4);
const int64_t out_depth = dy_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(3);
const int64_t out_width = dy_tensor->shape().At(4);
const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale);
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
UpsampleTrilinear3DBackward<T>(elem_cnt, dy_tensor->dptr<T>(), dy_helper, dx_helper,
dx_tensor->shape().At(2), dx_tensor->shape().At(3),
dx_tensor->shape().At(4), scale_depth, scale_height, scale_width,
align_corners, dx_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPTRILINEAR3D_CPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_trilinear_3d") \
.SetCreateFn<UpsampleTrilinear3DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \
.SetCreateFn<UpsampleTrilinearGrad3DCPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "cpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPTRILINEAR3D_CPU_KERNEL(float)
REGISTER_UPSAMPTRILINEAR3D_CPU_KERNEL(double)
REGISTER_UPSAMPTRILINEAR3D_CPU_KERNEL(int)
} // namespace oneflow
/*
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "oneflow/core/framework/framework.h"
#include "oneflow/core/kernel/new_kernel_util.h"
#include "oneflow/core/common/nd_index_offset_helper.h"
#include "oneflow/core/cuda/atomic.cuh"
#include "oneflow/user/kernels/upsample_kernel.h"
namespace oneflow {
namespace {
template<typename T>
__global__ void UpsampleTrilinear3DForward(const int64_t elem_cnt, const T* in_dptr,
NdIndexOffsetHelper<int64_t, 5> in_helper,
NdIndexOffsetHelper<int64_t, 5> out_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const T rdepth, const T rheight,
const T rwidth, const bool align_corners, T* out_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, d, h, w;
out_helper.OffsetToNdIndex(index, n, c, d, h, w);
const T t1r = GetAreaPixel(rdepth, d, align_corners);
const int64_t t1 = t1r;
const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0;
const T t1lambda = t1r - t1;
const T t0lambda = static_cast<T>(1.) - t1lambda;
const T h1r = GetAreaPixel(rheight, h, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
const T w1r = GetAreaPixel(rwidth, w, align_corners);
const int64_t w1 = w1r;
const int64_t w1p = (w1 < in_width - 1) ? 1 : 0;
const T w1lambda = w1r - w1;
const T w0lambda = static_cast<T>(1.) - w1lambda;
const T* pos1 = &in_dptr[in_helper.NdIndexToOffset(n, c, t1, h1, w1)];
out_dptr[index] =
t0lambda
* (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ h1lambda
* (w0lambda * pos1[h1p * in_width] + w1lambda * pos1[h1p * in_width + w1p]))
+ t1lambda
* (h0lambda
* (w0lambda * pos1[t1p * in_height * in_width]
+ w1lambda * pos1[t1p * in_height * in_width + w1p])
+ h1lambda
* (w0lambda * pos1[t1p * in_height * in_width + h1p * in_width]
+ w1lambda * pos1[t1p * in_height * in_width + h1p * in_width + w1p]));
}
}
template<typename T>
__global__ void UpsampleTrilinear3DBackward(const int64_t elem_cnt, const T* dy_dptr,
NdIndexOffsetHelper<int64_t, 5> dy_helper,
NdIndexOffsetHelper<int64_t, 5> dx_helper,
const int64_t in_depth, const int64_t in_height,
const int64_t in_width, const T rdepth, const T rheight,
const T rwidth, const bool align_corners, T* dx_dptr) {
CUDA_1D_KERNEL_LOOP(index, elem_cnt) {
int64_t n, c, d, h, w;
dy_helper.OffsetToNdIndex(index, n, c, d, h, w);
const T t1r = GetAreaPixel(rdepth, d, align_corners);
const int64_t t1 = t1r;
const int64_t t1p = (t1 < in_depth - 1) ? 1 : 0;
const T t1lambda = t1r - t1;
const T t0lambda = static_cast<T>(1.) - t1lambda;
const T h1r = GetAreaPixel(rheight, h, align_corners);
const int64_t h1 = h1r;
const int64_t h1p = (h1 < in_height - 1) ? 1 : 0;
const T h1lambda = h1r - h1;
const T h0lambda = static_cast<T>(1.) - h1lambda;
const T w1r = GetAreaPixel(rwidth, w, align_corners);
const int64_t w1 = w1r;
const int64_t w1p = (w1 < in_width - 1) ? 1 : 0;
const T w1lambda = w1r - w1;
const T w0lambda = static_cast<T>(1.) - w1lambda;
T* pos1 = &dx_dptr[dx_helper.NdIndexToOffset(n, c, t1, h1, w1)];
const T* pos2 = &dy_dptr[index];
cuda::atomic::Add(pos1 + 0, t0lambda * h0lambda * w0lambda * pos2[0]);
cuda::atomic::Add(pos1 + w1p, t0lambda * h0lambda * w1lambda * pos2[0]);
cuda::atomic::Add(pos1 + h1p * in_width, t0lambda * h1lambda * w0lambda * pos2[0]);
cuda::atomic::Add(pos1 + h1p * in_width + w1p, t0lambda * h1lambda * w1lambda * pos2[0]);
cuda::atomic::Add(pos1 + t1p * in_height * in_width, t1lambda * h0lambda * w0lambda * pos2[0]);
cuda::atomic::Add(pos1 + t1p * in_height * in_width + w1p,
t1lambda * h0lambda * w1lambda * pos2[0]);
cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width,
t1lambda * h1lambda * w0lambda * pos2[0]);
cuda::atomic::Add(pos1 + t1p * in_height * in_width + h1p * in_width + w1p,
t1lambda * h1lambda * w1lambda * pos2[0]);
}
}
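// As in the nearest-neighbour backward kernels, each dy element scatters up to eight weighted
// contributions into dx, and distinct threads can target the same corner voxel, so every
// accumulation goes through cuda::atomic::Add.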
} // namespace
template<typename T>
class UpsampleTrilinear3DGPUKernel final : public user_op::OpKernel {
public:
UpsampleTrilinear3DGPUKernel() = default;
~UpsampleTrilinear3DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
const user_op::Tensor* x_tensor = ctx->Tensor4ArgNameAndIndex("x", 0);
user_op::Tensor* y_tensor = ctx->Tensor4ArgNameAndIndex("y", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = y_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> in_helper(x_tensor->shape().At(0), x_tensor->shape().At(1),
x_tensor->shape().At(2), x_tensor->shape().At(3),
x_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> out_helper(y_tensor->shape().At(0), y_tensor->shape().At(1),
y_tensor->shape().At(2), y_tensor->shape().At(3),
y_tensor->shape().At(4));
const int64_t in_depth = x_tensor->shape().At(2);
const int64_t in_height = x_tensor->shape().At(3);
const int64_t in_width = x_tensor->shape().At(4);
const int64_t out_depth = y_tensor->shape().At(2);
const int64_t out_height = y_tensor->shape().At(3);
const int64_t out_width = y_tensor->shape().At(4);
const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale);
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleTrilinear3DForward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
x_tensor->dptr<T>(), in_helper, out_helper, x_tensor->shape().At(2),
x_tensor->shape().At(3), x_tensor->shape().At(4), scale_depth, scale_height,
scale_width, align_corners, y_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
template<typename T>
class UpsampleTrilinearGrad3DGPUKernel final : public user_op::OpKernel {
public:
UpsampleTrilinearGrad3DGPUKernel() = default;
~UpsampleTrilinearGrad3DGPUKernel() = default;
private:
void Compute(user_op::KernelComputeContext* ctx) const override {
user_op::Tensor* dx_tensor = ctx->Tensor4ArgNameAndIndex("dx", 0);
Memset<DeviceType::kGPU>(ctx->device_ctx(), dx_tensor->mut_dptr<T>(), 0,
dx_tensor->shape().elem_cnt() * sizeof(T));
const user_op::Tensor* dy_tensor = ctx->Tensor4ArgNameAndIndex("dy", 0);
const float depth_scale = ctx->Attr<float>("depth_scale");
const float height_scale = ctx->Attr<float>("height_scale");
const float width_scale = ctx->Attr<float>("width_scale");
const bool align_corners = ctx->Attr<bool>("align_corners");
const int64_t elem_cnt = dy_tensor->shape().elem_cnt();
NdIndexOffsetHelper<int64_t, 5> dy_helper(dy_tensor->shape().At(0), dy_tensor->shape().At(1),
dy_tensor->shape().At(2), dy_tensor->shape().At(3),
dy_tensor->shape().At(4));
NdIndexOffsetHelper<int64_t, 5> dx_helper(dx_tensor->shape().At(0), dx_tensor->shape().At(1),
dx_tensor->shape().At(2), dx_tensor->shape().At(3),
dx_tensor->shape().At(4));
const int64_t in_depth = dx_tensor->shape().At(2);
const int64_t in_height = dx_tensor->shape().At(3);
const int64_t in_width = dx_tensor->shape().At(4);
const int64_t out_depth = dy_tensor->shape().At(2);
const int64_t out_height = dy_tensor->shape().At(3);
const int64_t out_width = dy_tensor->shape().At(4);
const T scale_depth = GetAreaPixelScale(in_depth, out_depth, align_corners, depth_scale);
const T scale_height = GetAreaPixelScale(in_height, out_height, align_corners, height_scale);
const T scale_width = GetAreaPixelScale(in_width, out_width, align_corners, width_scale);
RUN_CUDA_KERNEL((UpsampleTrilinear3DBackward<T>), ctx->device_ctx(), elem_cnt, elem_cnt,
dy_tensor->dptr<T>(), dy_helper, dx_helper, dx_tensor->shape().At(2),
dx_tensor->shape().At(3), dx_tensor->shape().At(4), scale_depth, scale_height,
scale_width, align_corners, dx_tensor->mut_dptr<T>());
}
bool AlwaysComputeWhenAllOutputsEmpty() const override { return false; }
};
#define REGISTER_UPSAMPTRILINEAR3D_GPU_KERNEL(dtype) \
REGISTER_USER_KERNEL("upsample_trilinear_3d") \
.SetCreateFn<UpsampleTrilinear3DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("y", 0) == GetDataType<dtype>::value)); \
REGISTER_USER_KERNEL("upsample_trilinear_3d_grad") \
.SetCreateFn<UpsampleTrilinearGrad3DGPUKernel<dtype>>() \
.SetIsMatchedHob((user_op::HobDeviceTag() == "gpu") \
& (user_op::HobDataType("dx", 0) == GetDataType<dtype>::value));
REGISTER_UPSAMPTRILINEAR3D_GPU_KERNEL(float)
REGISTER_UPSAMPTRILINEAR3D_GPU_KERNEL(double)
REGISTER_UPSAMPTRILINEAR3D_GPU_KERNEL(int)
} // namespace oneflow